Silhouette plots suggest optimal number of clusters is 2 so let’s see what results look like with only two clusters:

Steps of my analysis

Load all relevant packages:

library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(stringr)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(urbnmapr)
library(devtools)
## Loading required package: usethis
library(readxl)
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(sp)
library(huge)
library(INLA)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: parallel
## This is INLA_22.12.16 built 2022-12-23 13:24:10 UTC.
##  - See www.r-inla.org/contact-us for how to get help.
library(HMMpa)
library(invgamma)
library(brinla)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(patchwork)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(geosphere)
library(urbnmapr)
library(RAQSAPI)
## Use the function
## RAQSAPI::aqs_credentials(username, key)
## before using other RAQSAPI functions
## See ?RAQSAPI::aqs_credentials for more information
library(con2aqi)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(corrplot)
## corrplot 0.92 loaded
library(igraph)
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
library(shapes)
## 
## Attaching package: 'shapes'
## 
## The following object is masked from 'package:igraph':
## 
##     V
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Loading and (quickly cleaning) all necessary datasets:

###SoA data
soa.data = read_xlsx("SoA.data.1019.xlsx")

###county_flips are unique identifier for counties
soa.data$county_fips = as.character(soa.data$county_fips) ##change it to character

#IMPORTANT

# This shape file contains the coordinates for county boundaries
##counties is from urbanmap

CA.counties = urbnmapr::counties %>% filter(state_abbv == "CA")

###IF WE WANT TO BOIL DOWN TIME SERIES AND KEEP ALL DATA, SWITCH to CA_newdata below
soa_joint <- left_join(CA.counties, soa.data, by = "county_fips")
## Warning in left_join(CA.counties, soa.data, by = "county_fips"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

#Use with soa.data (full data)
CA_data = soa_joint %>% select(long, lat, county_name.y, Year, Score, Total_Pop,
                               EDUC_Lessthan9, EDUC_college, White_Collar,
                               Unemployment_Rate, Adj_HH_income, Income_Disparity,
                               Individuals_Below_Poverty, Median_Home_Value,
                               Median_Gross_Rent, Housing_No_Telephone,
                               Housing_Incomplete_Plumbing)

colnames(CA_data)[3] = "County"

CA_newdata = soa.data[1:58,]
CA_newdata = CA_newdata[,-c(4,7,8)]


###Cal-ViDa data
mortality = read.csv("respmortality1419.csv") #data from 2014-2019 bc we want to avoid COVID pandemic era
mortality = filter(mortality,Cause_of_Death %in% c("Chronic lower respiratory diseases","Influenza and pneumonia"))
mortality = mortality[,-c(1,4,9)]
Population = rep(100000,nrow(mortality))
mortality = cbind(mortality,Population)

Quick descriptions of SoA data variables:

Score: social deprivation index (SDI) score calculated from the following 11 subindices EDUC_Lessthan9: % of population older than 24 with less than 9 years of education EDUC_college: % of population older than 24 with at least four years of college education White_Collar: % of population older than 15 employed in a white collar occupation Unemployment_Rate: unemployment rate for population older than 15 Adj_HH_income: median household income adjusted for local housing costs Income_Disparity: an income disparity ratio Individuals_Below_Poverty: % of population below the federal poverty line Median_Home_Value: median home value for owned, occupied units Median_Gross_Rent: median gross rent for rented units Housing_No_Telephone: % of households without a telephone Housing_Incomplete_Plumbing: % of households with incomplete plumbing

Heatmap of population by county

#Initializing map and station locations
ca_map <- map_data("county", region = "california")

#Match population dataset with ca_map
#2010-2019 population data for CA 
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
CApops = CApops[-1,]

CApops$CTYNAME = unique(ca_map$subregion)
colnames(CApops) = c("subregion","pop")

merged_data <- merge(ca_map, CApops, by = "subregion", all.x = TRUE)

#Plot
gg_pop <- ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  theme_void() +
  labs(title = "Heatmap of County Populations for 2019",fill = expression("Population")) +
  scale_fill_gradient(low = "yellow", high = "red")

print(gg_pop)

California state with county labels for reference:

SKATER clustering

The code below structures the dataframe to be fed into the spatial data frame (SPDF) object. Dimensions are 58 rows by 10 columns (each column is its own year)

###Setting up SPDF for CA counties 
CA_sf = st_read(getwd(),"CA_Counties_TIGER2016")
## Reading layer `CA_Counties_TIGER2016' from data source 
##   `C:\Users\jeffr\Desktop\Spatiotemporal + Causal Inference\Wildfire Paper 1 Code' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -13857270 ymin: 3832931 xmax: -12705030 ymax: 5162404
## Projected CRS: WGS 84 / Pseudo-Mercator
CA_spdf = as_Spatial(CA_sf)

# score_scaled = scale(CA_data$Score)

c_index = unique(CA_data$County)
y_index = unique(CA_data$Year)
SDI_df = matrix(nrow=10,ncol=58)
track1 = 1
track2 = 1

for (i in c_index){
  for (j in y_index){
    scores = CA_data %>% filter(County == i) %>% filter(Year == j) %>% select(Score) %>% unique()
    SDI_df[track1,track2] = scores$Score
    track1 = track1 + 1
  }
  track1 = 1
  track2 = track2 + 1
}

score_scaled = scale(SDI_df) #NEED TO SCALE THE DATA BEFORE FEEDING IT INTO SKATER

#covariates_scale = data.frame(apply(CA_data[,4:16],2,scale))
covariates_scale = data.frame(t(score_scaled))

CA_spdf@data = covariates_scale

Using the SPDF from above, we follow the steps of SKATER tutorial (https://www.dshkol.com/post/spatially-constrained-clustering-and-regionalization/) to generate three separate clustering results: (1) Unconstrained/default (2) Clusters have minimum population constraint based on the total population / # of clusters (3) Clusters are comprised of a minimum number of counties (8 for smaller number of clusters, 4 for bigger numbers)

#Identify neighborhood list for counties 
CA_nb = poly2nb(CA_spdf)

#summary(CA_nb)

# plot(CA_spdf, main = "With queen")
# plot(CA_nb, coords = coordinates(CA_spdf), col="blue", add = TRUE)

#Calculate edge costs (dissimilarity matrix) based on Euclidean distance 
costs <- nbcosts(CA_nb, data = covariates_scale)

###Get adjacency matrix using nb2mat() (SEPARATE STEP FOR INLA)
adj = nb2mat(CA_nb,style = "B")

#Style means the coding scheme style used to create the weighting matrix 
# B: basic binary coding scheme
# W: row standardized coding scheme 
# C: globally standardized coding scheme  
# U: values of C / number of neighbors 
# S: variance stabilizing coding scheme 

#Transform edge costs to spatial weights 
ct_w <- nb2listw(CA_nb,costs,style="B")

#Create minimum spanning tree 
ct_mst <- mstree(ct_w)

plot(ct_mst,coordinates(CA_spdf),col="blue", cex.lab=0.5)
plot(CA_spdf, add=TRUE)

#Run SKATER algorithm to get 5 contiguous clusters (cluster idx is in order of CA_sf)
clus2 <- skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 1)

#Determine an appropriate minimum population threshold based on???
pops_summary = summary(unique(CA_data$Total_Pop))
pops_summary
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    12700    63275   219705   710796   750235 10105722
#Idea 1: Use median * (how many counties should be in a cluster at minimum)
min_pop = as.numeric(pops_summary[3] * 5)

#Idea 2: If we assume CA population is 39M, divide total pop by # clusters
min_pop2 = 39000000 / 2

#Add a min population constraint
clus2_min <- skater(edges = ct_mst[,1:2], 
                     data = covariates_scale, 
                     crit = min_pop2, 
                     vec.crit = CA_data$Total_Pop,
                     ncuts = 1)

#Add a minimum number of areas in each cluster constraint 
clus2_minarea = skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 1,4)


CA_data_cluster = (CA_sf %>% mutate(clus = clus2_min$groups))

#Plot clustered CA
plot((CA_sf %>% mutate(clus = clus2$groups))['clus'], main = "2 cluster example")

plot((CA_sf %>% mutate(clus = clus2_min$groups))['clus'], main = "2 cluster example with population constraint")

plot((CA_sf %>% mutate(clus = clus2_minarea$groups))['clus'], main = "2 cluster example with minimum number of areas constraint")

#plot(CA_sf,col=c("red","green","blue","purple","yellow")[clus2_min$groups],max.plot=17)

For reference, here are the cluster labels for each county:

clusterlabels = data.frame(CA_data_cluster$NAME,clus2_min$groups)
names(clusterlabels) = c("counties","Cluster")

o = order(clusterlabels$counties)
clusterlabels = clusterlabels[o,]
rownames(clusterlabels) = NULL

clusterlabels
##           counties Cluster
## 1          Alameda       1
## 2           Alpine       1
## 3           Amador       1
## 4            Butte       2
## 5        Calaveras       1
## 6           Colusa       2
## 7     Contra Costa       1
## 8        Del Norte       2
## 9        El Dorado       2
## 10          Fresno       1
## 11           Glenn       2
## 12        Humboldt       2
## 13        Imperial       1
## 14            Inyo       1
## 15            Kern       1
## 16           Kings       1
## 17            Lake       1
## 18          Lassen       2
## 19     Los Angeles       1
## 20          Madera       1
## 21           Marin       1
## 22        Mariposa       1
## 23       Mendocino       2
## 24          Merced       1
## 25           Modoc       2
## 26            Mono       1
## 27        Monterey       1
## 28            Napa       1
## 29          Nevada       2
## 30          Orange       1
## 31          Placer       2
## 32          Plumas       2
## 33       Riverside       1
## 34      Sacramento       1
## 35      San Benito       1
## 36  San Bernardino       1
## 37       San Diego       1
## 38   San Francisco       1
## 39     San Joaquin       1
## 40 San Luis Obispo       1
## 41       San Mateo       1
## 42   Santa Barbara       1
## 43     Santa Clara       1
## 44      Santa Cruz       1
## 45          Shasta       2
## 46          Sierra       2
## 47        Siskiyou       2
## 48          Solano       1
## 49          Sonoma       2
## 50      Stanislaus       1
## 51          Sutter       2
## 52          Tehama       2
## 53         Trinity       2
## 54          Tulare       1
## 55        Tuolumne       1
## 56         Ventura       1
## 57            Yolo       2
## 58            Yuba       2
counties = clusterlabels$counties
num_clus = max(clusterlabels$Cluster)

HUGE graph estimation

This code chunk takes the cluster grouping from SKATER and aggregates the full dataframe from the SPDF (58x10) into a 10x2 matrix (10 time points x 2 clusters) to be fed into the graph estimation package HUGE. The data from each county in a given cluster is aggregated based on a population weighted mean.

HUGE uses glasso to estimate a graph structure based on the aggregated feature data which recall, is the SDI score (socioeconomic status) from the SoA. We use a grid of lambda values under 1 in order to ensure that some edges will be present in the estimates produced by HUGE. This decision is supported by the fact that partial correlations calculated via regression appear to be statistically significant. Based on simulation results, we believe that EBIC is a suitable criterion for choosing the best estimated graph in the huge.select() step.

#Aggregate feature vectors into one vector for each SKATER cluster
CA_cluster = data.frame(CA_sf$NAMELSAD,clus2_min$groups)
names(CA_cluster) = c("County","Cluster")
year = 2010:2019

CA_cluster = left_join(CA_cluster,CA_data,by = "County")

#Get weighted avg value for Score for each cluster for each year 
#Create new data matrix of aggregated feature vectors 
cluster_features = matrix(NA,nrow = 10,ncol = num_clus)

for (i in 1:num_clus){
  cluster = CA_cluster %>% filter(Cluster == i)

  for(j in 1:10){
    #Obtain a weighted mean based on population
    vec = cluster %>% filter(Year == year[j]) %>% select(Score,Total_Pop) %>% unique()
    cluster.pop = sum(vec$Total_Pop)
    cluster.popweights = vec$Total_Pop/cluster.pop
    cluster_features[j,i] = weighted.mean(vec$Score,cluster.popweights)
  }
}

#Graph learning w HUGE
out.glasso = huge(cluster_features,lambda = seq(0.95,0.05,by=-0.05),method="glasso")
## Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 5%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 10%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 15%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 21%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 26%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 31%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 36%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 42%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 47%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 52%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 57%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 63%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 68%
## Conducting the graphical lasso (glasso)....done.                                          
glasso.stars = huge.select(out.glasso,criterion = "stars",stars.thresh = 0.1)
## Conducting Subsampling....in progress:5% Conducting Subsampling....in progress:10% Conducting Subsampling....in progress:15% Conducting Subsampling....in progress:20% Conducting Subsampling....in progress:25% Conducting Subsampling....in progress:30% Conducting Subsampling....in progress:35% Conducting Subsampling....in progress:40% Conducting Subsampling....in progress:45% Conducting Subsampling....in progress:50% Conducting Subsampling....in progress:55% Conducting Subsampling....in progress:60% Conducting Subsampling....in progress:65% Conducting Subsampling....in progress:70% Conducting Subsampling....in progress:75% Conducting Subsampling....in progress:80% Conducting Subsampling....in progress:85% Conducting Subsampling....in progress:90% Conducting Subsampling....in progress:95% Conducting Subsampling....in progress:100% Conducting Subsampling....done.                  
glasso.ric = huge.select(out.glasso,criterion = "ric")
## Conducting rotation information criterion (ric) selection....done
## Computing the optimal graph....done
glasso.ebic = huge.select(out.glasso,criterion = "ebic")
## Conducting extended Bayesian information criterion (ebic) selection....done
plot(glasso.stars)

plot(glasso.ric)

plot(glasso.ebic)

huge.est = glasso.ebic$refit
huge.est
##      [,1] [,2]
## [1,]    0    1
## [2,]    1    0
#Identify which clusters/nodes are the most connected on the graph i.e. has the most association with the other nodes 
degree_connectivity = data.frame(colSums(huge.est))
colnames(degree_connectivity) = "node_connections"
degree_connectivity = cbind(c(1:num_clus),degree_connectivity)

Plotting HUGE graph

# Convert the adjacency matrix to a graph object
g <- graph_from_adjacency_matrix(huge.est, mode = "undirected")

# Assign custom labels to vertices
V(g)$name <- c(1,2)

# Assign colors to vertices
V(g)$color <- c("red", "cyan")

# Plot the graph with labeled vertices
plot(g, vertex.label = V(g)$name, vertex.color = V(g)$color, vertex.size = 20)

Transforming estimated adjacency matrix to graph filter H

The code below takes the adjacency matrix estimated in the previous step and transforms it into a graph filter H. The steps are explained in Antonian et al 2019 (Gareth Peters’ paper). The cutoff transformation is applied to the eigenvalues of the graph Laplacian.

A = as.matrix(huge.est)
p = nrow(A)

#obtain graph Laplacian L
D = diag(p)
for (i in 1:p){
  d = sum(A[,i])
  D[i,i] = d
}

L = D - A

#eigendecomposition of L
Ldecomp = eigen(L)
U = as.matrix(Ldecomp$vectors)
Lambdas = Ldecomp$values

#test
#U %*% (diag(p)*Lambdas) %*% t(U)

#Function implementing cutoff tranform for eigenvalues 
cutoff.transform = function(lambdas,q){
  transformed = c()
  cutoff = quantile(lambdas,q)
  for (i in lambdas){
    if(i <= cutoff){
      transformed = c(transformed,1)
    }
    else{
      transformed = c(transformed,0)
    }
  }
  
  return(transformed)
}

#quantile(Lambdas,2/3)
transformed.L = cutoff.transform(Lambdas,2/3)
eta.L = diag(p)*transformed.L

#obtain graph filter
H = U %*% eta.L %*% t(U)
H
##      [,1] [,2]
## [1,]  0.5  0.5
## [2,]  0.5  0.5
gfilter_weight = norm((1/7)*H^2,type = "F")

Create a function to generate heatmap plots of matrices

matrix_heatmap= function(matrix,title = "",gradient_zones = c(0,0.5,0.999)){
  r = nrow(matrix)
  df = as_tibble(cbind(expand.grid(rev(seq_len(r)),seq_len(r)),c(matrix))) %>% setNames(c("row","col","value"))
  df$value[df$value == 1] = 0.999
  
  plot = ggplot(df,mapping = aes(x=row,y=col,fill=value)) + geom_tile() + 
    scale_fill_gradientn(colors = c("yellow","orange","red"),
                       values = rescale(gradient_zones),
                       limits = c(0, 0.99),
                       oob = squish) + ggtitle(title) + theme_void()
  return(plot)
}

Heatmap plot of graph filter

#Heatmap of resulting H 
corrplot(H, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
         title = "Graph filter")

matrix_heatmap(H,title = "")

Downloading EPA data

library(tidyverse)
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(dplyr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)
## Warning: package 'mapdata' was built under R version 4.2.3
library(geosphere)
library(urbnmapr)
library(RAQSAPI)
library(con2aqi)

aqs_credentials("jeffreywu@ucsb.edu","copperheron86")

Get county and pollutant reference codes from EPA

#Get county codes 
counties_url = "https://aqs.epa.gov/data/api/list/countiesByState?email=jeffreywu@ucsb.edu&key=copperheron86&state=06&"

countycodes = fromJSON(counties_url)
countycodes = countycodes[[2]]
california_counties = countycodes$code
counties = countycodes$value_represented 

#Get parameter codes
parameters_url = "https://aqs.epa.gov/data/api/list/parametersByClass?email=jeffreywu@ucsb.edu&key=copperheron86&pc=CRITERIA"

parametercodes = fromJSON(parameters_url)
parametercodes = parametercodes[[2]]
parametercodes = parametercodes[-7,]

pollutants = data.frame(parametercodes$code)
labels = c("lead","co","so2","no2","o3","pm10","pm25")
pollutants = cbind(pollutants,labels)

Identifying a set of monitoring stations that are well distributed across CA

The goal here is to query all of the EPA measurements for all 7 pollutants and AQI from 2014-2019. The first step is to query all stations in California that collect measurements for at least one of the 7 pollutants.

FUNCTION THAT QUERIES STATION LOCATIONS FOR A GIVEN POLLUTANT

query_aqs_station_data <- function(param,year){
  start_date <- paste0(year, "0101")
  end_date <- paste0(year, "1231")
  
  url <- paste0("https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86&param=", param, "&bdate=", start_date, "&edate=", end_date, "&state=06")
  
  myData <- fromJSON(url)
  station_data = myData[[2]]
  
  return(station_data)
}

FOR EACH POLLUTANT, GRAB ALL MONITORING STATIONS FOR EACH YEAR

stations_url = "https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86&param=88101&bdate=20140101&edate=20141231&state=06"

stations = fromJSON(stations_url)
stations = stations[[2]]

station_data2014_pm2.5 = stations %>% select(latitude,longitude,site_number, local_site_name,county_code,county_name)

#Get monitoring station locations for each pollutant for each year (takes approx 3 min)
all_pollutants_station_data <- list()
for (year in 2014:2019){
  year_data <- lapply(pollutants, query_aqs_station_data, year = year)
  all_pollutants_station_data[[as.character(year)]] <- year_data
}


# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

all_pollutants_station_2014data = all_pollutants_station_data[[1]]
all_pollutants_station_2015data = all_pollutants_station_data[[2]]
all_pollutants_station_2016data = all_pollutants_station_data[[3]]
all_pollutants_station_2017data = all_pollutants_station_data[[4]]
all_pollutants_station_2018data = all_pollutants_station_data[[5]]
all_pollutants_station_2019data = all_pollutants_station_data[[6]]

Identify subset of stations that has best spatial coverage wrt CA state

Starting off with a list of all the stations measuring each parameter/pollutant, we want to identify a subset of stations that measure each pollutant for each county. This overall set of stations (subset for each county combined together) should have a good spatial coverage of the state.

This is difficult because there is not a station in every county measuring each pollutant. So in order to identify a good set of stations to query data from, I first looked up the two largest cities in each county, based on population.

Lat/long for 2 biggest cities (based on population) in each county

citylats = c(37.8044,37.5485,38.7743,38.8071,38.3527,38.3488,39.7285,39.7596,
             38.1231,38.0678,39.2143,39.1546,37.9780,38.0049,41.7558,41.7548,
             38.9399,38.6688,36.7378,36.8252,39.7474,39.5243,40.8021,40.8665,
             32.7920,32.6789,37.3614,37.3855,35.3733,35.7688,36.3275,36.3008,
             38.9582,38.8080,40.4163,40.2840,34.0522,33.7701,36.9613,37.1230,
             37.9735,38.1074,37.4849,37.4320,39.4457,39.4096,37.3022,37.0583,
             41.4871,41.4099,37.6485,38.5149,36.6777,36.6149,38.2975,38.1749,
             39.3280,39.2191,33.8366,33.7455,38.7521,38.7907,39.9341,40.3063,
             33.9806,33.9425,38.5816,38.4088,36.8525,36.8125,34.1083,34.0922,
             32.7157,32.6401,37.7749,37.9780,37.9577,37.7396,35.2828,35.6369,
             37.6879,37.5630,34.9530,34.4208,37.3387,37.3688,36.9741,36.9102,
             40.5865,40.4482,39.6763,39.5595,41.7354,41.3099,38.1041,38.2492,
             38.4404,38.2324,37.6393,37.4946,39.1404,39.1165,40.1785,39.9277,
             40.7310,40.4156,36.3301,36.2077,38.0297,37.9829,34.1975,34.1706,
             38.5449,38.6785,39.1277,39.0954)

citylongs = c(122.2712,121.9886,119.8219,119.7960,120.9327,120.7741,121.8375,121.6219,
              120.8509,120.5385,122.0094,122.1494,122.0311,121.8058,124.2026,124.1580,
              119.9772,120.9872,119.7871,119.7029,122.1964,122.1936,124.1637,124.0828,
              115.5631,115.4989,118.3997,118.4105,119.0187,119.2471,119.6457,119.7829,
              122.6264,122.5583,120.6530,120.5394,118.2437,118.1937,120.0607,120.2602,
              122.5311,122.5697,119.9663,120.0985,123.8053,123.3556,120.4830,120.8499,
              120.5425,120.6791,118.9721,119.4768,121.6555,121.8221,122.2869,122.2608,
              120.1833,121.0611,117.9143,117.8677,121.2880,121.2358,120.8980,121.2319,
              117.3755,117.2297,121.4944,121.3716,121.4016,121.3658,117.2898,117.4350,
              117.1611,117.0842,122.4194,122.0311,121.2908,121.4260,120.6596,120.6545,
              122.4702,122.3255,120.4357,119.6982,121.8853,122.0363,122.0308,121.7569,
              122.3917,122.2978,120.2410,120.8277,122.6345,122.3106,122.2566,122.0405,
              122.7141,122.6367,120.9970,120.8460,121.6169,121.6380,122.2358,122.1792,
              122.9420,123.2100,119.2966,119.3473,119.9741,120.3822,119.1771,118.8376,
              121.7405,121.7733,121.5508,121.5522)
citylongs = -1*citylongs

Alameda: Oakland (429082) and Fremont

Alpine: Alpine Village (225) and Mesa Vista

Amador: Ione (8363) and Jackson

Butte: Chico (94776) and Paradise

Calaveras: Rancho Calaveras (5324) and Angels Camp

Colusa: Colusa (5911) and Williams

Contra Costa: Concord (129688) and Antioch

Del Norte: Crescent City (6805) and Bertsch-Oceanview

El Dorado: South Lake Tahoe (22036) and Cameron Park

Fresno: Fresno (530093) and Clovis

Glenn: Orland (7644) and Willows

Humboldt: Eureka (26998) and Arcata

Imperial: El Centro (44120) and Calexico

Inyo: Bishop (3746) and Dixon Lane-Meadow Creek

Kern: Bakersfield (383579) and Delano

Kings: Hanford (56910) and Lemoore

Lake: Clearlake (15384) and Hidden Valley Lake

Lassen: Susanville (15165) and Janesville

Los Angeles: Los Angeles (3990000) and Long Beach

Madera: Madera (65706) and Chowchilla

Marin: San Rafael (58704) and Novato

Mariposa: Mariposa (1526) and Catheys Valley

Mendocino: Fort Bragg (7359) and Willits

Merced: Merced (83316) and Los Banos

Modoc: Alturas (2509) and California Pines

Mono: Mammoth Lakes (8127) and Walker

Monterey: Salinas (156259) and Seaside

Napa: Napa (79263) and American Canyon

Nevada: Truckee (16561) and Grass Valley

Orange: Anaheim (352005) and Santa Ana

Placer: Roseville (139117) and Rocklin

Plumas: East Quincy (2489) and Chester

Riverside: Riverside (330063) and Moreno Valley

Sacramento: Sacramento (508529) and Elk Grove

San Benito: Hollister (39749) and Ridgemark

San Bernandino: San Bernandino (215941) and Fontana

San Diego: San Diego (1426000) and Chula Vista

San Francisco: San Francisco (810000) and Concord

San Joaquin: Stockton (311178) and Tracy

San Luis Obispo: San Luis Obispo (47446) and Paso Robles

San Mateo: Daly City (107008) and San Mateo

Santa Barbara: Santa Maria (107408) and Santa Barbara

Santa Clara: San Jose (1030000) and Sunnyvale

Santa Cruz: Santa Cruz (64725) and Watsonville

Shasta: Redding (91772) and Anderson

Sierra: Loyalton (700) and Downieville

Siskiyou: Yreka (7556) and Mount Shasta

Solano: Vallejo (121913) and Fairfield

Sonoma: Santa Rosa (177586) and Petaluma

Stanislaus: Modesto (215030) and Turlock

Sutter: Yuba City and South Yuba City

Tehama: Red Bluff (14283) and Corning

Trinity: Weaverville (3667) and Post Mountain

Tulare: Visalia (133800) and Tulare

Tuolumne: Phoenix Lake-Cedar Ridge (5108) and Sonora

Ventura: Oxnard (209877) and Thousand Oaks

Yolo: Davis (69289) and Woodland

Yuba: Linda (17773) and Olivehurst

Then, I created the function below to choose a group of stations that are within a certain distance (Haversine distance from the latitude and longitude) of each city that I identified in the previous step. If there are less than 5 stations associated to a given city, the distance threshold (which starts at 100km) is increased by 50km.

FUNCTION THAT SELECTS SET OF STATIONS CLOSEST TO A GIVEN LAT/LONG

# Function to filter stations based on spatial coverage
subset_stations_by_spatial_coverage <- function(station_data, reference_lat, reference_lon, max_distance_km=100) {
  # Calculate distances between stations and reference location
  distances <- distHaversine(
    cbind(station_data$longitude, station_data$latitude),
    c(reference_lon, reference_lat)
  )
  distances <- distances/1000
  
  # idx =  which(distances == min(distances))
  # #Identify station within min distance to centroid of county
  # station_data_subset <- station_data[idx, ]
  
  # Subset stations within the specified max_distance_km
  idx = which(distances <= max_distance_km)
  station_data_subset <- station_data[idx, ]
  station_data_subset <- cbind(station_data_subset,distances[idx])
  
  while (nrow(station_data_subset) < 5){
    max_distance_km = max_distance_km + 50
    station_data_subset = subset_stations_by_spatial_coverage(station_data, 
                              reference_lat, reference_lon, max_distance_km)
  }
  
  return(station_data_subset)
}

# # Construct the subset of stations based on spatial coverage criteria (test)
# reference_lat = citylats[3]
# reference_lon = citylongs[3]
# max_distance_km = 100
# 
# subset_stations <- subset_stations_by_spatial_coverage(station_data2014_pm2.5, reference_lat, reference_lon, max_distance_km)
# 
# # Print the subset of stations
# print(subset_stations)
# Obtain centroid lat/longs for each county 
CA.counties2 = read.csv("counties.ca.data.csv")
ca.coordinates = data.frame(CA.counties2$county,CA.counties2$lat,CA.counties2$lng)
colnames(ca.coordinates) = c("county","lat","long")

ca.coordinates = ca.coordinates[order(ca.coordinates$county),]
row.names(ca.coordinates) = NULL

IMPORTANT FUNCTION:

Given a dataset containing station locations/codes for a given pollutant and year, the function below selects 5-20 stations that are closest to the lat/longs for the two biggest cities in each county and puts the station information (code, lat, long, etc) into a dataframe.

#Function that finds best monitoring station for each county for a specific pollutant for a specific year
# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

best_stations = function(stationdata,pollutant){

  subset_list = list()
  
  #Load lat/longs for 58x2 cities into dataframe
  CA.coords = data.frame(rep(countycodes$value_represented,each = 2),citylats,citylongs)
  colnames(CA.coords) = c("County","Lat","Long")
  
  #Find closest station for each county centroid using subset_stations_by_spatial_coverage function
  for (i in 1:nrow(CA.coords)){
  reference_lat = CA.coords$Lat[i]
  reference_lon = CA.coords$Long[i]
  max_distance_km = 100
  
  subset_stations <- subset_stations_by_spatial_coverage(stationdata[[pollutant]], reference_lat, reference_lon, max_distance_km)
  subset_list[[i]] = subset_stations
  }
  
  #Combine pairs of city lists together 
  subset_list2 = list()
  sequence = seq(2,116,2)
  for(i in sequence){
    combine = rbind(subset_list[[i]],subset_list[[i-1]])
    subset_list2[[i-1]] = combine
  }
  subset_list2 =subset_list2[!sapply(subset_list2,is.null)]
  
  #Create a county label vector
  repnames = c()
  for(i in 1:58){
    repnames = c(repnames,nrow(subset_list2[[i]]))
  }  
  countylabels = rep(countycodes$value_represented,times = repnames)
  
  #Format the list into dataframe
  beststations = as.data.frame(do.call(rbind, subset_list2))
  beststations = cbind(countylabels,beststations$county_name,
                       beststations$`distances[idx]`,beststations)
  colnames(beststations)[c(1,2,3)] = c("measuring_county","station_county","distance_apart")
  rownames(beststations) = NULL
  
  return(beststations)
}

# #test cases
# pm2.5_stations_2014 = best_stations(all_pollutants_station_2014data,7)
# CO_stations_2016 = best_stations(all_pollutants_station_2016data,2)

CREATING BEST STATION LIST/DATAFRAME FOR EACH POLLUTANT, EACH ENTRY IS A YEAR

#Generate list for best stations for each pollutant for each year
Lead_stations = list()

Lead_stations[[1]] = best_stations(all_pollutants_station_2014data,1)
Lead_stations[[2]] = best_stations(all_pollutants_station_2015data,1)
Lead_stations[[3]] = best_stations(all_pollutants_station_2016data,1)
Lead_stations[[4]] = best_stations(all_pollutants_station_2017data,1)
Lead_stations[[5]] = best_stations(all_pollutants_station_2018data,1)
Lead_stations[[6]] = best_stations(all_pollutants_station_2019data,1)



CO_stations = list()

CO_stations[[1]] = best_stations(all_pollutants_station_2014data,2)
CO_stations[[2]] = best_stations(all_pollutants_station_2015data,2)
CO_stations[[3]] = best_stations(all_pollutants_station_2016data,2)
CO_stations[[4]] = best_stations(all_pollutants_station_2017data,2)
CO_stations[[5]] = best_stations(all_pollutants_station_2018data,2)
CO_stations[[6]] = best_stations(all_pollutants_station_2019data,2)



SO2_stations = list()

SO2_stations[[1]] = best_stations(all_pollutants_station_2014data,3)
SO2_stations[[2]] = best_stations(all_pollutants_station_2015data,3)
SO2_stations[[3]] = best_stations(all_pollutants_station_2016data,3)
SO2_stations[[4]] = best_stations(all_pollutants_station_2017data,3)
SO2_stations[[5]] = best_stations(all_pollutants_station_2018data,3)
SO2_stations[[6]] = best_stations(all_pollutants_station_2019data,3)



NO2_stations = list()

NO2_stations[[1]] = best_stations(all_pollutants_station_2014data,4)
NO2_stations[[2]] = best_stations(all_pollutants_station_2015data,4)
NO2_stations[[3]] = best_stations(all_pollutants_station_2016data,4)
NO2_stations[[4]] = best_stations(all_pollutants_station_2017data,4)
NO2_stations[[5]] = best_stations(all_pollutants_station_2018data,4)
NO2_stations[[6]] = best_stations(all_pollutants_station_2019data,4)



O3_stations = list()

O3_stations[[1]] = best_stations(all_pollutants_station_2014data,5)
O3_stations[[2]] = best_stations(all_pollutants_station_2015data,5)
O3_stations[[3]] = best_stations(all_pollutants_station_2016data,5)
O3_stations[[4]] = best_stations(all_pollutants_station_2017data,5)
O3_stations[[5]] = best_stations(all_pollutants_station_2018data,5)
O3_stations[[6]] = best_stations(all_pollutants_station_2019data,5)



PM10_stations = list()

PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,6)
PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,6)
PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,6)
PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,6)
PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,6)
PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,6)



# Lead.PM10_stations = list()
# 
# Lead.PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
# Lead.PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
# Lead.PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
# Lead.PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
# Lead.PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
# Lead.PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,7)



PM2.5_stations = list()

PM2.5_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
PM2.5_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
PM2.5_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
PM2.5_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
PM2.5_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
PM2.5_stations[[6]] = best_stations(all_pollutants_station_2019data,7)

Heatmap of population with station locations marked (2015)

Do the stations provide a good spatial coverage of California? To me, the coverage is reasonable especially because most of the northern and eastern counties are where most of the sparsely populated counties are located. There are probably not that many EPA stations there as a result.

pollutants1_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2015_8.17.RData")
pollutants2_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2015_8.17.RData")

stationlats = c(unique(pollutants1_2015$latitude),unique(pollutants2_2015$latitude))
stationlongs = c(unique(pollutants1_2015$longitude),unique(pollutants2_2015$longitude))

station_points = data.frame(stationlats,stationlongs)

#Plot
gg_pop_stations <- ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  theme_void() +
  labs(title = "Heatmap of County Populations with Station Locations for 2015") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

# Add points
gg_pop_stations <- gg_pop_stations +
  geom_point(data = station_points, aes(x = stationlongs, y = stationlats), 
             color = "red", size = 1.5)

print(gg_pop_stations)

Downloading and aggregating air quality data using direct API calls

Given a set of 5-20 monitoring stations for each county, we loop through its station codes (for each year 2014-2019) and query using the EPA’s AQS function. This function only allows you to query a maximum of 4 parameters at once for a single year, so two calls to the function have to be made for each year (4 and 3).

(BELOW SHOWS IT BEING DONE FOR 2019)

END GOAL FINAL FORM: ONE BIG DATAFRAME (ALL POLLUTANTS ALL YEARS TOGETHER, USE FILTER TO SEPARATE)

# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

stations2019x = rbind(Lead_stations[[6]],CO_stations[[6]],
             SO2_stations[[6]],NO2_stations[[6]])
stations2019y = rbind(O3_stations[[6]],PM10_stations[[6]],PM2.5_stations[[6]])

sitenums2019x = stations2019x %>% select(county_code,site_number) %>% unique() #198 stations
sitenums2019y = stations2019y %>% select(county_code,site_number) %>% unique() #178 stations


#Trying EPA R Package query (took 15 + 20 min!) gives us a dataframe 
ccodes = sitenums2019y$county_code
snums = sitenums2019y$site_number
str1 = "2019-01-01"
str2 = "2019-12-31"

pollutants1_2019 = aqs_dailysummary_by_site(parameter = c("14129","42101","42401","42602"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)

pollutants2_2019 = aqs_dailysummary_by_site(parameter = c("44201","81102","88101"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)

###SAVE LIST LOCALLY
saveRDS(pollutants2_2019,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")

After querying the raw data for each pollutant and each year, we want to go through it and remove any data from stations that have “bad” data. The standards that I set are that for a given year, a station should have at least 240 of the 365 days included for a year of data. Additionally, I applied a Hampel filter to identify outliers and if there are more than 14 consecutive outliers in the data i.e., days in a row with measurements that are abnormal, I considered the data from that station to not be suitable for inclusion in the final dataset.

NOTE: the daily values reported in the raw dataset are actually daily averages from periodic measurements made by the station throughout the day

After data from so called “bad” stations were removed, another function is applied which aggregates the daily observations into a monthly median The raw data is in the form of a single dataframe, which is fed into the important function raw_transform(). This function uses several functions to create a list of dataframes, one for each county, which represent the monthly median measurements for a certain pollutant in a given county in a given year.

QUALITY CHECK FUNCTION FOR STATION DATA: WANT TO ADDRESS OUTLIERS, MISSINGNESS

#Given a dataset like CO2016 (list of 1000ish stations), check for 2/3 missing data and for strings of outliers (14 in a row)

station_quality_check = function(station_data){
  l = length(station_data)
  badindex = c()
  consecutive_outliers = list()
  
  for (i in 1:l){
    aqi = station_data[[i]]$aqi
    pollutant_level = station_data[[i]]$arithmetic_mean
    
    #check for outliers in AQI
    median.aqi = median(na.omit(aqi))
    mad.aqi = mad(na.omit(aqi))
    
    min.aqi = median.aqi-(3*mad.aqi)
    max.aqi = median.aqi+(3*mad.aqi)
    
    outliers.aqi = which(aqi < min.aqi | aqi > max.aqi)
    
    result.aqi = rle(diff(outliers.aqi))

    
    #check for outliers in pollutant measure
    median.pollutant = median(na.omit(pollutant_level))
    mad.pollutant = mad(na.omit(pollutant_level))
    
    min.pollutant = median.pollutant-(3*mad.pollutant)
    max.pollutant = median.pollutant+(3*mad.pollutant)
    
    outliers.pollutant = which(pollutant_level < min.pollutant | pollutant_level > max.pollutant)
    
    result.pollutants = rle(diff(outliers.pollutant))
    
    
    if (nrow(station_data[[i]]) < 240){
      badindex = c(badindex,i)
    }
    
    else if (any(result.aqi$lengths >= 14 & result.aqi$values == 1) == TRUE){
      badindex = c(badindex,i)
    }
    
    else if (any(result.pollutants$lengths >= 14 & result.pollutants$values == 1) == TRUE){
      badindex = c(badindex,i)
    }
    
    consecutive_outliers[[i]] = c("AQI",outliers.aqi,"POLLUTANTS",outliers.pollutant)
  }
  
  bad_list = list(badindex,consecutive_outliers)
  
  return(bad_list)
}


#Test on CO2016 and CO2017
# station_quality_check(CO2016) #returns 61 "bad stations" out of 1230
# 
# removeidx = station_quality_check(SO22017)[[1]] #returns 715 out of 1056 "bad stations" 
# 
# test = SO22017[- removeidx]

FUNCTION THAT AGGREGATES DAILY DATA INTO MONTHYLY MEDIANS

monthly_agg = function(pollutantdata){
  #Aggregating all the station data at once
  date = ymd(pollutantdata$date_local)
  df2 <- pollutantdata                                   # Duplicate data
  df2$year_month <- floor_date(date,"month")  # Create year-month column
  df3 = df2 %>% select(county,site_number,arithmetic_mean,aqi,year_month) %>% as.data.frame()
  
  df3$arithmetic_mean = as.numeric(df3$arithmetic_mean)
  df3$aqi[which(df3$aqi == "NULL")] = NA
  df3$aqi = as.numeric(df3$aqi)
  
  df.agg = df3 %>% group_by(year_month) %>% dplyr::summarize(arithmetic_mean = median(na.omit(arithmetic_mean)),aqi = median(na.omit(aqi))) %>% as.data.frame()
  
  return(df.agg)
}

IMPORTANT FUNCTION: TRANSFORMING RAW DATA TO FINAL FORM

# Group 1: 14129 - Lead, 421012 - Carbon monoxide (CO), 42401 - Sulfure dioxide (SO2), 42602 - Nitrogen dioxide (NO2)
# Group 2: 44201 - Ozone (O3), 81102 - Total PM10, 88101 - PM2.5

raw_transform = function(rawdata,reference_list,standard){
  
  ###SEPARATE DF INTO A LIST OF DFs 
  
  matched_list = list()
  
  if(missing(standard)){
    for (i in 1:nrow(reference_list)){
    data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i])
  
    matched_list[[i]] = data
    }
  } else {
      for (i in 1:nrow(reference_list)){
      data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i],pollutant_standard == standard)

      matched_list[[i]] = data
    }
  }
  
  names(matched_list) = reference_list$measuring_county
  
  ###STATION QUALITY CHECK
  
  removeidx = station_quality_check(matched_list)[[1]]
  good_matched_list = matched_list[- removeidx]
  
  #Convert list back into one big dataframe
  temp = as.data.frame(do.call(rbind, good_matched_list)) #TOO MANY ROWS RIGHT?
  good_df = unique.data.frame(temp)
  
  
  ###MAKE A LIST OF COMBINED STATION DATA FOR EACH COUNTY
  mid_list = list()

  for (i in unique(reference_list$measuring_county)){
    
    df_new = data.frame(good_df[1,])
    subset = reference_list %>% filter(measuring_county == i) %>% select(county_code,site_number)
  
    for (j in 1:nrow(subset)){
      pull = good_df %>% filter(county_code == reference_list$county_code[j], site_number == reference_list$site_number[j])
      
      df_new = rbind(df_new,pull)
    }
    
    df_new = df_new[-1,]
    mid_list[[i]] = df_new
  }
  
  ###AGGREGATE DAILY DATA TO MONTHLY FOR EACH COUNTY
  
  final_list = lapply(mid_list,monthly_agg)
  
  return(final_list)
}

When assembling final datasets, note that certain pollutant standards are used bc they have values for AQI… the ones I used were:

Lead: Lead 3-Month 2009 ?? Has all NAs for AQI

CO: CO 8-hour 1971

SO2: SO2 1-hour 2010

NO2: NO2 1-hour 2010

O3: Ozone 8-hour 2015 ; sample duration should be 8 HR

PM10: PM10 24-hour 2006

PM2.5: PM25 24-hour 2012

APPLY RAW TRANSFORM FUNCTION TO ALL POLLUTANTS FOR ALL YEARS (BELOW SHOWS IT BEING DONE FOR 2019)

#Load raw data
pollutants1_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2019_8.18.RData")
pollutants2_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")

Lead2019 = pollutants1_2019 %>% filter(parameter_code == "14129")
CO2019 = pollutants1_2019 %>% filter(parameter_code == "42101")
SO22019 = pollutants1_2019 %>% filter(parameter_code == "42401")
NO22019 = pollutants1_2019 %>% filter(parameter_code == "42602")
O32019 = pollutants2_2019 %>% filter(parameter_code == "44201")
PM102019 = pollutants2_2019 %>% filter(parameter_code == "81102")
PM2.52019 = pollutants2_2019 %>% filter(parameter_code == "88101")


Lead2019_final = raw_transform(rawdata = Lead2019,reference_list = Lead_stations[[6]],standard = "Lead 3-Month 2009")

CO2019_final = raw_transform(rawdata = CO2019,reference_list = CO_stations[[6]],standard = "CO 8-hour 1971")

SO22019_final = raw_transform(rawdata = SO22019,reference_list = SO2_stations[[6]],standard = "SO2 1-hour 2010")

NO22019_final = raw_transform(rawdata = NO22019,reference_list = NO2_stations[[6]],standard = "NO2 1-hour 2010")

O32019_final = raw_transform(rawdata = O32019,reference_list = O3_stations[[6]],standard = "Ozone 8-hour 2015")

PM102019_final = raw_transform(rawdata = PM102019,reference_list = PM10_stations[[6]],standard = "PM10 24-hour 2006") 

PM2.52019_final = raw_transform(rawdata = PM2.52019,reference_list = PM2.5_stations[[6]],standard = "PM25 24-hour 2012")

COMBINING EACH POLLUTANTS DATASET INTO A SINGLE DATAFRAME FOR THE YEAR (BELOW SHOWS IT BEING DONE FOR 2019)

###Combine final data into one dataframe for 2014 
test1 = as.data.frame(do.call(rbind, Lead2019_final))
test1 = cbind(test1,rep(pollutants$parametercodes.code[1],nrow(test1))) #maybe change parameter codes to 1-7?
colnames(test1) = c("Year-Month","Value","AQI","Pollutant")

test2 = as.data.frame(do.call(rbind, CO2019_final))
test2 = cbind(test2,rep(pollutants$parametercodes.code[2],nrow(test2))) #maybe change parameter codes to 1-7?
colnames(test2) = c("Year-Month","Value","AQI","Pollutant")

test3 = as.data.frame(do.call(rbind, SO22019_final))
test3 = cbind(test3,rep(pollutants$parametercodes.code[3],nrow(test3))) #maybe change parameter codes to 1-7?
colnames(test3) = c("Year-Month","Value","AQI","Pollutant")

test4 = as.data.frame(do.call(rbind, NO22019_final))
test4 = cbind(test4,rep(pollutants$parametercodes.code[4],nrow(test4))) #maybe change parameter codes to 1-7?
colnames(test4) = c("Year-Month","Value","AQI","Pollutant")

test5 = as.data.frame(do.call(rbind, O32019_final))
test5 = cbind(test5,rep(pollutants$parametercodes.code[5],nrow(test5))) #maybe change parameter codes to 1-7?
colnames(test5) = c("Year-Month","Value","AQI","Pollutant")

test6 = as.data.frame(do.call(rbind, PM102019_final))
test6 = cbind(test6,rep(pollutants$parametercodes.code[6],nrow(test6))) #maybe change parameter codes to 1-7?
colnames(test6) = c("Year-Month","Value","AQI","Pollutant")

test7 = as.data.frame(do.call(rbind, PM2.52019_final))
test7 = cbind(test7,rep(pollutants$parametercodes.code[7],nrow(test7))) #maybe change parameter codes to 1-7?
colnames(test7) = c("Year-Month","Value","AQI","Pollutant")

#Combine each pollutant dataset into one big dataset for the year
final_data19 = rbind(test1,test2,test3,test4,test5,test6,test7)

###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

HAVE TO CLEAN DATA BEFORE FINALIZING:

Having combined the monthly medians for every county for a single year for each pollutant into a single dataframe, one final cleaning step must be performed before putting each years’ data together. The AQI value that is in each row corresponds to the AQI standardized measurement for that specific pollutant. Each pollutant has a different standardizing equation, but once they are all standardized as they are in the dataset and they can be compared against each other. The reported AQI measurement for a given day is just the maximum of the AQI values corresponding to each of the 7 pollutants. So the maximum AQI value (among 6 values bc Lead observations never have AQI values) was found for each month and that value was set as the actual AQI value for that month in all corresponding rows.

FIND MAX AQI (AMONG THE 7 POLLUTANTS) FOR EACH MONTH -> SET AS ACTUAL AQI FOR THAT MONTH

(BELOW SHOWS IT BEING DONE FOR 2019)

#Do for each year 
months = c("01","02","03","04","05","06","07","08","09","10","11","12")

###Do for each year 
for (i in 1:58){
  idx1 = which(stringr::str_starts(rownames(final_data19), counties[i]))
  subset1 = final_data19[idx1,]
  subset1$`Year-Month`= as.Date(subset1$`Year-Month`)
  
  for (j in months){ 
  #Filter by county and date
    date = paste0("2019-",j,"-01")
    date = as.Date(date)
    subset2 = subset1 %>% filter(`Year-Month` == as.Date(date)) 
    
    trueAQI = max(na.omit(subset2$AQI))
    
    idx2 = which(subset1$`Year-Month` == date)
    subset1$AQI[idx2] = trueAQI
  }
  
  final_data19[idx1,] = subset1
}

###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

COMBINING EACH YEARS DATASET INTO ONE BIG TIDY DATAFRAME FOR AIR QUALITY COVARIATES

FILL IN YOUR OWN FILE DIRECTORIES HERE! START WORKING W ACTUAL EPA DATA FROM HERE ON

final_data14 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data14_9.1.RData")
final_data15 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data15_9.1.RData")
final_data16 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data16_9.1.RData")
final_data17 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data17_9.1.RData")
final_data18 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data18_9.1.RData")
final_data19 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

final_EPA_data = rbind(final_data14,final_data15,final_data16,final_data17,
                       final_data18,final_data19)
saveRDS(final_EPA_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")


final_EPA_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")
head(final_EPA_data)
##           Year-Month  Value AQI Pollutant
## Alameda.1 2014-01-01 0.0500  60     14129
## Alameda.2 2014-02-01 0.0520  28     14129
## Alameda.3 2014-03-01 0.0655  34     14129
## Alameda.4 2014-04-01 0.0360  40     14129
## Alameda.5 2014-05-01 0.0300  38     14129
## Alameda.6 2014-06-01 0.0230  37     14129

Adding cluster labels to the EPA data and aggregating based on clusters

Once the EPA data is in the correct format (one big dataframe) at the county level, we now need to aggregate it to the cluster level according to the SKATER cluster labels from before. A population weighted mean was used to aggregate here as well

Cluster = rep(1,length(final_EPA_data))
final_EPA_agg_data = cbind(final_EPA_data,Cluster)

for (i in 1:58){
  idx = which(stringr::str_starts(rownames(final_EPA_agg_data), counties[i]))
  final_EPA_agg_data$Cluster[idx] = clusterlabels$Cluster[i]
}

Time = c(rep(c(1:12),58),rep(13:24,58),rep(25:36,58),rep(37:48,58),rep(49:60,58),rep(61:72,58)) 
Time = rep(Time,7)
final_EPA_agg_data = cbind(Time,final_EPA_agg_data)

AGGREGATE CLUSTER DATA AND COMBINE INTO ONE DATAFRAME

countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))
countypops$County = rep(counties,each=6)

temp_EPA_agg_data = data.frame(final_EPA_agg_data[1,-2])

for (k in 1:num_clus){
  
  EPA_clus_k = data.frame(final_EPA_agg_data[1,-2])
  
  for (i in pollutants$parametercodes.code){
  pollutant_data = final_EPA_agg_data %>% filter(Pollutant == i)

  cluster_data = pollutant_data %>% filter(Cluster == k)
  cluster_data$Value = scale(cluster_data$Value)
  cluster_data$AQI = scale(cluster_data$AQI)
  year = 2014
  
  for(j in 1:72){
    cluster_data_j = cluster_data %>% filter(Time == j)
    cluster_counties = countypops %>% filter(Cluster == k,Year == year)
    
    pops = countypops %>% filter(Year == year,Cluster == k) %>% select(Total_Pop) 
    cluster.pop = sum(pops)
    cluster.popweights = pops/cluster.pop
    
    value_wmean = weighted.mean(cluster_data_j$Value,cluster.popweights$Total_Pop)
    aqi_wmean = weighted.mean(cluster_data_j$AQI,cluster.popweights$Total_Pop)
    insert = data.frame(Time = j,value_wmean,aqi_wmean,
                        Pollutant = i,Cluster = k)
    colnames(insert) = colnames(EPA_clus_k)
    
    EPA_clus_k = rbind(EPA_clus_k,insert)
    
    if ((j>12) & (j<25)){
      year = 2015
    }
    
    else if ((j>24) & (j<37)){
      year = 2016
    }
    
    else if ((j>36) & (j<49)){
      year = 2017
    }
    
    else if ((j>48) & (j<61)){
      year = 2018
    }
    
    else if ((j>60) & (j<73)){
      year = 2019
    }
    
    else{
      year = 2014
    }
  }
  }
  
  EPA_clus_k = EPA_clus_k[-1,]
  trueAQI = EPA_clus_k$AQI[1:72]
  trueAQI = rep(trueAQI,7)
  EPA_clus_k$AQI = trueAQI
  rownames(EPA_clus_k) = NULL
  
  temp_EPA_agg_data = rbind(temp_EPA_agg_data,EPA_clus_k)
  
}

final_EPA_agg_data = temp_EPA_agg_data[-1,]

saveRDS(final_EPA_agg_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")

final_EPA_agg_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")

head(final_EPA_agg_data)
##   Time     Value         AQI Pollutant Cluster
## 1    1 3.4901051  2.65621838     14129       1
## 2    2 3.7109930 -1.23651722     14129       1
## 3    3 5.2019861 -0.42790716     14129       1
## 4    4 1.9438900  0.30556092     14129       1
## 5    5 1.2812264  0.07395346     14129       1
## 6    6 0.5081188 -0.26487801     14129       1

Plot time series of each EPA variable

library(astsa)
## Warning: package 'astsa' was built under R version 4.2.3
## 
## Attaching package: 'astsa'
## The following object is masked from 'package:maps':
## 
##     unemp
for (i in pollutants$parametercodes.code){
  for (j in 1:num_clus){
    EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)

    title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
    tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")  
  }
}

It turns out that the time series for Lead was not stationary, so we need to detrend the data before using it

for (i in pollutants$parametercodes.code){
  for (j in 1:num_clus){
    EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)
    lead_idx = which(final_EPA_agg_data$Pollutant == "14129" & final_EPA_agg_data$Cluster == j)
    
    if (i == "14129"){
      detrended1 = c(0,diff(EPA_clus_ts$Value,lag=1))
      final_EPA_agg_data[lead_idx,2] = detrended1
      
      title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
      tsplot(detrended1,main = title,xlab = "Time (months)",ylab = expression(nabla~Value))
    } else{
      title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
      tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")  
    }
  }
}

for (k in 1:num_clus){
  AQI_clus_ts = final_EPA_agg_data %>% filter(Pollutant == "14129") %>% filter(Cluster == k)
  title = sprintf("AQI - Cluster %1.0f",k)
  tsplot(AQI_clus_ts$AQI,main = title,xlab = "Time (months)",ylab = "Value")
}

Building covariate matrices to be used for gram matrix calculations

The kernel structures presented below reflect the findings of Chen et al. (2019) that distributed lags and interaction effects should be considered when evaluating the effects of air pollutants. Additionally, we wanted these lagged and joint effects to also vary in time. So, instead of calculating true autoregressive (AR), distributed lag (DL), and interaction structures, which assume stationarity, we construct localized kernels which have dynamic localized conditional correlations through time.

EPA_cluster_list = list()

cluster_EPA_data = function(cluster){
  EPA_clus_ts = matrix(nrow=72)

  for (i in pollutants$parametercodes.code){
    covariate_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == cluster)
    EPA_clus_ts = cbind(EPA_clus_ts,covariate_ts$Value)
  }
  EPA_clus_ts[,1] = covariate_ts$AQI
  colnames(EPA_clus_ts) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
  
  return(EPA_clus_ts)
}

for (i in 1:num_clus){
  EPA_cluster_list[[i]] = cluster_EPA_data(i)
}

The first step is to decompose our covariate time series into its trend, seasonal, and residual components because we want to learn each kernel on a different component.

decompose_ts = function(EPA_ts){
  S_scaled = EPA_ts 
  colnames(S_scaled) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
  S_scaled_ts = ts(S_scaled, frequency = 12)
  
  # S_decomposed = decompose(S_scaled_ts)
  
  ts_decomposed <- lapply(colnames(S_scaled_ts), function(x) {stl(S_scaled_ts[, x], s.window = "periodic")})
  names(ts_decomposed) = colnames(S_scaled_ts)
  
  S_seasonal = ts_decomposed[[1]]$time.series[,1]
  S_trend = ts_decomposed[[1]]$time.series[,2]
  S_random = ts_decomposed[[1]]$time.series[,3]
  
  for (i in 2:8){
    S_seasonal = cbind(S_seasonal,ts_decomposed[[i]]$time.series[,1])
    S_trend = cbind(S_trend,ts_decomposed[[i]]$time.series[,2])
    S_random = cbind(S_random,ts_decomposed[[i]]$time.series[,3])
  }
  
  colnames(S_trend)= colnames(S_scaled_ts)
  colnames(S_seasonal)= colnames(S_scaled_ts)
  colnames(S_random)= colnames(S_scaled_ts)
  
  S_DL = S_seasonal + S_random
  colnames(S_DL)= colnames(S_scaled_ts)
  
  S_random_int = S_random[12:72,]
  S_random = S_random[13:72,]
  S_seasonal = S_seasonal[13:72,]
  S_trend = S_trend[13:72,]
  S_DL = S_DL[13:72,]
  
  S_DL = data.frame(S_DL)
  S_DL2 = matrix(nrow=60)
  dl = c(3,6,12)
  col_num = 2
  for (i in dl){
    for (j in 1:ncol(S_DL)){
      extract = S_DL[(72-59-i):(72-i),j]
      S_DL2 = cbind(S_DL2,extract)
      colnames(S_DL2)[col_num] = sprintf("B%1.0f-%s",i,colnames(S_DL)[j])
      col_num = col_num+1
    }
  }
  
  S_DL2 = S_DL2[,-1]
  S_DL_final = cbind(S_DL[13:72,],S_DL2)
  
  W = matrix(nrow=(nrow(S_random))^2)
  num_cols = ncol(S_random)
  # num_cols = ncol(S_DL_final) #for now, just calculate interaction pairs for actual covariates
  col_num = 2
  
  for (i in 1:num_cols){
    for (j in 1:num_cols){
      interaction_col = kronecker(S_random[,i],S_random[,j]) #replace S_random with S_DL_final for DL interactions 
      W = cbind(W,interaction_col)
      
      colnames(W)[col_num] = sprintf("%sx%s",colnames(S_scaled)[i],colnames(S_scaled)[j])
      col_num = col_num+1
    }
  }
  W = W[,-1]
  
  row1 = c()
  
  for (k in 1:ncol(S_random_int)){
    row1 = c(row1,S_random_int[2,k]*S_random_int[1,])
  }
  
  W2 = rbind(as.numeric(row1),W)
  W2 = W2[,-seq(1,64,by=9)] #need to change if we include DL interactions
  
  list = list(S_random,S_random_int,S_seasonal,S_DL,
              S_DL2,S_DL_final,S_trend,W2)
  names(list) = c("S_random","S_random_int","S_seasonal","S_DL",
              "S_DL2","S_DL_final","S_trend","W2")
  return(list)
}

decompose_clus1 = decompose_ts(EPA_cluster_list[[1]])
decompose_clus2 = decompose_ts(EPA_cluster_list[[2]])

decomposed_cluster_data = list(decompose_clus1,decompose_clus2)

S_random_all = cbind(decompose_clus1$S_random,decompose_clus2$S_random)

S_DL_all = cbind(decompose_clus1$S_DL,decompose_clus2$S_DL)

W2_all = cbind(decompose_clus1$W2,decompose_clus2$W2)

Calculating autoregressive structure

Using the residual component of each time series, we construct a kernel that calculates the autocorrelation at one lag for all time points

Linear time invariant approach:

Let \(\sigma^2_{AR} = \gamma(0)\) be the variance of one of our time series, we can find ACVF and ACF from Yule-Walker as

\(\gamma(k) = a_1 \gamma(k-1) + a_2 \gamma(k-2) + ... + a_p \gamma(k-p)\)

\(\rho(k) = \frac{\gamma(k)}{\gamma(0)} = a_1 \rho(k-1) + a_2 \rho(k-2) + ... + a_p \rho(k-p)\)

AR_invariant_list = list()

for (c in 1:num_clus){
  #Grab S_random data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_random_clus = cluster_data$S_random
  
  ar.corr.values = c()
  ar.cov.values = c()
  
  for (i in 1:ncol(S_random_clus)){
    var = var(S_random_clus[,i])
    fit.ar = ar(S_random_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
    
    corr.ar1 = fit.ar$ar
    cov.ar1 = fit.ar$ar * var
    
    ar.corr.values = c(ar.corr.values,corr.ar1)
    ar.cov.values = c(ar.cov.values,cov.ar1)
  }
  
  for (j in 1:ncol(S_random_clus)){
    AR_invariant_covmatrix = diag(nrow(S_random_clus))
    
    AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) - 1] = ar.cov.values[j]
    AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) + 1] = ar.cov.values[j]
  }
  
  AR_invariant = diag(nrow(S_random_clus))
  AR_invariant[row(AR_invariant) == col(AR_invariant) - 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
  AR_invariant[row(AR_invariant) == col(AR_invariant) + 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
  
  # corrplot(AR_invariant, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Invariant AR 1 Covariance Structure")
  
  matrix_heatmap(AR_invariant,title = "Invariant AR 1 Covariance Structure")
  
  AR_invariant_list[[c]] = AR_invariant
}

K_AR_invariant = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_AR_invariant = K_AR_invariant + ((1/num_clus)*AR_invariant_list[[i]])
}

# corrplot(K_AR_invariant, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Invariant AR 1 Covariance Structure")

matrix_heatmap(K_AR_invariant,title = "Invariant AR 1 Covariance Structure")

K_AR_cluster = list()
K_AR_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab S_random data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_random_clus = cluster_data$S_random
  
  #Create a list to contain covariance matrix for each pollutant (8)
  K_AR_list = list()
  K_AR_periodic_list = list()
  
  rho_AR = 1
  sigma2_AR = 1
  
  time_span = nrow(S_random_clus)
  
  #Calculate a AR 1 covariance matrix for each pollutant and store in list
  for (i in 1:8){
    ts =  S_random_clus[,i]
  
    K_covariate = matrix(nrow=time_span,ncol=time_span)
    K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
    
    for(j in 1:time_span){
      for (k in 1:time_span){
        if (abs(j-k) <= 1){
          
          K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel 
                               / (2*rho_AR)) * sigma2_AR
          
          K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel 
                       / (2*rho_AR)) * exp(- (2*sin((abs(ts[j] - ts[k]))*3.14/12)^2)
                       / (rho_AR)) * sigma2_AR
        }
        else{
          K_covariate_periodic[j,k] = 0
          K_covariate[j,k] = 0
          }
      }
    }
    
    K_AR_list[[i]] = K_covariate
    K_AR_periodic_list[[i]] = K_covariate_periodic
  }
  
  names(K_AR_list) = colnames(S_random_clus)
  names(K_AR_periodic_list) = colnames(S_random_clus)
  
  #Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
  K_AR = matrix(0,nrow=60,ncol=60)
  K_AR_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:length(K_AR_periodic_list)){
    K_AR = K_AR + ((1/8)*K_AR_list[[i]])
    K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
  }
  
  K_AR_cluster[[c]] = K_AR
  K_AR_periodic_cluster[[c]] = K_AR_periodic
}
for (i in 1:num_clus){
  title1 = sprintf("AR 1 Covariance for Cluster %s",i)
  title2 = sprintf("Periodic AR 1 Covariance for Cluster %s",i)
  # melted_covmat <- melt(K_AR_periodic_list[[i]])
  # K_AR_periodic_heatmap = ggplot(data = melted_covmat, aes(x=Var1, y=Var2, fill=value)) +
  # geom_tile() + ggtitle(title)
  # 
  # print(K_AR_periodic_heatmap)
  
  # corrplot(K_AR_cluster[[i]], is.corr = FALSE, method = 'color', tl.cex = 0.5, title = title1)
  # corrplot(K_AR_periodic_cluster[[i]], is.corr = FALSE, method = 'color', tl.cex = 0.5, title = title2)
  
  matrix_heatmap(K_AR_cluster[[i]],title = title1)
  matrix_heatmap(K_AR_periodic_cluster[[i]],title = title2)

}

Combine each cluster’s AR 1 kernel together:

K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
  K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
}

#Heatmap of resulting K 
# melted_covmat <- melt(t(K_AR_periodic))
# K_AR_periodic_heatmap = ggplot(data = melted_covmat, aes(x=Var1, y=Var2, fill=value)) + 
#   geom_tile() + ggtitle("AR 1 Covariance Structure")
# 
# K_AR_periodic_heatmap

# corrplot(K_AR, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "AR 1 Covariance Structure")
# corrplot(K_AR_periodic, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Periodic AR 1 Covariance Structure")

matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")

matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")

Calculating distributed lag structure

Here, we wish to account for lagged effects that we believe are significant (3,6, and 12 months). Note that we add the seasonal component to the residual component of our covariate time series decomposition to get our input for this kernel.

DL_invariant_list = list()

for (c in 1:num_clus){
  
  #Grab S_DL data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_DL_clus = cluster_data$S_DL
  
  dl3.corr.values = c()
  dl3.cov.values = c()
  
  dl6.corr.values = c()
  dl6.cov.values = c()
  
  dl12.corr.values = c()
  dl12.cov.values = c()
  
  for (i in 1:ncol(S_DL_clus)){
    var = var(S_DL_clus[,i])
    fit.ar = ar(S_DL_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
    
    #Fit a parametric AR model for each lag 
    dl3 = arima(S_DL_clus[,i],order = c(3,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,NA))
    
    dl6 = arima(S_DL_clus[,i],order = c(6,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,0,0,0,NA))
    
    dl12 = arima(S_DL_clus[,i],order = c(12,0,0),seasonal = c(0,0,0),include.mean = FALSE,
                 fixed = c(0,0,0,0,0,0,0,0,0,0,0,NA))
    
    #Calculate correlations and covariances from coefficient estimates
    corr.dl3 = as.numeric(dl3$coef[3])
    cov.dl3 = as.numeric(dl3$coef[3]) * var
    
    corr.dl6 = as.numeric(dl6$coef[6])
    cov.dl6 = as.numeric(dl6$coef[6]) * var
    
    corr.dl12 = as.numeric(dl12$coef[12])
    cov.dl12 = as.numeric(dl12$coef[12]) * var
    
    
    dl3.corr.values = c(dl3.corr.values,corr.dl3)
    dl3.cov.values = c(dl3.cov.values,cov.dl3)
    
    dl6.corr.values = c(dl6.corr.values,corr.dl6)
    dl6.cov.values = c(dl6.cov.values,cov.dl6)
    
    dl12.corr.values = c(dl12.corr.values,corr.dl12)
    dl12.cov.values = c(dl12.cov.values,cov.dl12)
  }
  
  # Run if you want to create a DL_invariant matrix for each pollutant 
  # for (j in 1:ncol(S_DL_clus)){
  #   DL_invariant_covmatrix = diag(nrow(S_DL_clus))
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = dl3.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = dl3.cov.values[j]
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = dl6.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = dl6.cov.values[j]
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = dl12.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = dl12.cov.values[j]
  #   
  #   # title = sprintf("Covariance of %s",colnames(S_DL_all)[j])
  #   # corrplot(DL_invariant_covmatrix, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = title)
  
  #   # matrix_heatmap(DL_invariant_covmatrix,title = title)

  # }
  
  DL_invariant_covmatrix = diag(nrow(S_DL_clus))
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = sum(dl3.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = sum(dl3.cov.values)*(1/8)
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = sum(dl6.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = sum(dl6.cov.values)*(1/8)
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = sum(dl12.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = sum(dl12.cov.values)*(1/8)
  
  DL_invariant_list[[c]] = DL_invariant_covmatrix
  
  corrplot(DL_invariant_covmatrix, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Invariant DL (3,6,12) Covariance Structure")
}

#Combine DL covariance matrices from each cluster together
K_DL_invariant = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_DL_invariant = K_DL_invariant + ((1/num_clus)*DL_invariant_list[[i]])
}

# corrplot(K_DL_invariant, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Invariant DL(3,6,12) Covariance Structure")
matrix_heatmap(K_DL_invariant,title = "Invariant DL(3,6,12) Covariance Structure")

K_DL_cluster = list()
K_DL_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab S_DL data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_DL_clus = cluster_data$S_DL
  
  #Create a list to store covariance matrix for each DL 
  K_DL_list = list()
  K_DL_periodic_list = list()
  
  dl_lags = c(3,6,12)
  tracker = 1
  
  for (i in dl_lags){
    
    K_DL = matrix(nrow=time_span,ncol=time_span)
    K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
    
    rho_DL = 1
    sigma2_DL = 1
    
    #Calculate DL covariance matrix for specified lag   
    for(j in 1:nrow(S_DL_clus)){
      for (k in 1:nrow(S_DL_clus)){
        
        if ((abs(j-k) == 0) || (abs(j-k) == i)){
          
          K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL)) * sigma2_DL
          
          K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
                               / (2*rho_DL)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*3.14/12)^2)
                               / (rho_DL)) * sigma2_DL
          
        } 
        else{
          K_DL_periodic[j,k] = 0
          K_DL[j,k] = 0
          }
      }
    }
    
    K_DL_list[[tracker]] = K_DL
    K_DL_periodic_list[[tracker]] = K_DL_periodic
    tracker = tracker+1
  }
  
  #Combine the 3 DL covariance matrices together
  K_DL = matrix(0,nrow=time_span,ncol=time_span)
  K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
  
  for(i in 1:length(K_DL_periodic_list)){
    K_DL = K_DL + ((1/3)*K_DL_list[[i]])
    K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
  }
  
  #Store DL(3,6,12) covariance matrix for each cluster 
  K_DL_cluster[[c]] = K_DL
  K_DL_periodic_cluster[[c]] = K_DL
}

Combining DL kernels for each cluster together:

K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)

for(i in 1:num_clus){
  K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
  K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
}

#Heatmap of resulting K 
# melted_covmat <- melt(K_DL_periodic)
# K_DL_periodic_heatmap = ggplot(data = melted_covmat, aes(x=Var1, y=Var2, fill=value)) + 
#   geom_tile() + ggtitle("Distributed Lag (3,6,12) Covariance Structure")
# 
# K_DL_periodic_heatmap

# corrplot(K_DL, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "DL (3,6,12) Covariance Structure")
# corrplot(K_DL_periodic, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Periodic DL (3,6,12) Covariance Structure")

matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")

matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")

Calculating interaction structure

Finally, we want to include the two way interaction structures for every pair of EPA covariate time series. It is reasonable to think that the value of an air pollutant covariate at one time point may affect the value of another air pollutant at the same or even future time point. Interaction pairs are calculated by performing the kronecker product on two of time series vectors.

K_Interaction_cluster = list()
K_Interaction_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab interaction pair data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  W2_clus = cluster_data$W2
  
  K_interaction_list = list()
  K_interaction_periodic_list = list()
  
  column_names = colnames(W2_clus)
  time_span = nrow(W2_clus)
  
  #Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
  lag0_idx = seq(2,3601,by=61)
  lag1_idx = seq(1,3600,by=61)
  
  #Calculate a kernel for each interaction pair 
  for (a in 1:length(column_names)){
    interaction =  W2_clus[,a]
    
    #First calculate these two interaction kernels separately 
    K_int0 = matrix(nrow = 60,ncol = 60)
    K_int1 = matrix(nrow = 60,ncol = 60)
    
    K_int0_periodic = matrix(nrow = 60,ncol = 60)
    K_int1_periodic = matrix(nrow = 60,ncol = 60)
    
    rho_int = 1
    sigma2_int = 1
    
    for (i in 1:60){
      for (j in 1:60){
        
        #RBF kernels
        K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                            / (2*rho_int)) * sigma2_int
  
        K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                            / (2*rho_int)) * sigma2_int
        
        #Locally periodic kernels 
        K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                         / (2*rho_int)) * exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*3.14/12)^2)
                         / (rho_int)) * sigma2_int
            
        K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                             / (2*rho_int)) * exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*3.14/12)^2)
                             / (rho_int)) * sigma2_int
      }
    }
    
    #Combine real time and one lag interaction kernels together
    K_interaction = 0.5*K_int0 + 0.5*K_int1
    K_interaction_list[[a]] = K_interaction
    
    K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
    K_interaction_periodic_list[[a]] = K_interaction_periodic
  }
  
  #Combine kernels for each interaction pair together
  K_interaction = matrix(0,nrow=60,ncol=60)
  K_interaction_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:length(K_interaction_periodic_list)){
    K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
    
    K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
  }
  
  # corrplot(K_interaction, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Interaction Covariance Structure")
  # corrplot(K_interaction_periodic, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Periodic Interaction Covariance Structure")
  
  matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
  matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")

  
  #Store final interaction kernel (for all pairs) for each cluster 
  K_Interaction_cluster[[c]] = K_interaction
  K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
}

Combining interaction kernels from each cluster together:

K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
  
  K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
}

# corrplot(K_interaction, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Interaction Covariance Structure")
# corrplot(K_interaction_periodic, is.corr = FALSE, method = 'color', tl.cex = 0.5, title = "Periodic Interaction Covariance Structure")

matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")

matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")

Kernel target alignment calculation between 3 variations of each kernel

F_norm = function(kernel){
  
  total = sum(as.numeric(abs(kernel)^2))
  Fnorm = sqrt(total)
  return(Fnorm)
}

KTA_norm = function(kernel1,kernel2){
  
  centered_k1 = t(diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))) %*%
    kernel1 %*% diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))
  
  centered_k2 = t(diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))) %*%
    kernel2 %*% diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))
  
  term1 = centered_k1 / (F_norm(centered_k1))
  term2 = centered_k2 / (F_norm(centered_k2))
  term3 = term1 - term2
  term4 = F_norm(term3)
  
  measure = 1 - 0.5*(term4^2)
  return(measure)
}

KTA_rownames = c("AR Invariant-AR RBF", "AR Invariant-AR LP",
                 "AR RBF-AR LP ", "DL Invariant-DL RBF",
                 "DL Invariant-DL LP", "DL RBF-DL LP",
                 "Interaction RBF-Interaction LP")

KTA_table = matrix(c(KTA_norm(K_AR_invariant,K_AR),
                       KTA_norm(K_AR_invariant,K_AR_periodic),
                       KTA_norm(K_AR,K_AR_periodic),
                       KTA_norm(K_DL_invariant,K_DL),
                       KTA_norm(K_DL_invariant,K_DL_periodic),
                       KTA_norm(K_DL,K_DL_periodic),
                       KTA_norm(K_interaction,K_interaction_periodic)),
                   nrow=7)

KTA_table = data.frame(KTA_table)
rownames(KTA_table) = KTA_rownames
colnames(KTA_table) = "Centered KTA via inner product"

KTA_table
##                                Centered KTA via inner product
## AR Invariant-AR RBF                                 0.5997390
## AR Invariant-AR LP                                  0.6107497
## AR RBF-AR LP                                        0.9997572
## DL Invariant-DL RBF                                 0.7963747
## DL Invariant-DL LP                                  0.7963747
## DL RBF-DL LP                                        1.0000000
## Interaction RBF-Interaction LP                      0.9999317

Cleaning and aggregating CalViDa mortality data

Imputing “< 11” values in data with EM algorithm

In the mortality dataset obtained from Cal-ViDa, all of the cells with small values i.e., less than 10 but not equal to 0, were censored. So in order to avoid using a truncated Poisson distribution, we decided to impute these censored values with an EM algorithm which is described below:

Due to interval censoring, we do not observe the exact mortality counts for some units. Let \(C_i\) be the censoring indicator such that \(C_i = I(1 \le Y_i \le 10)\), i.e., the count is censored if it is between 1 and 10. For the EM algorithm, we will be modeling the rate using a generalized linear model where we use the variables month (our time index), cause of death, and age group as predictors. So let us assume that \(\lambda_i(\beta) = \exp(\alpha_i + \textbf{X}_i^T \boldsymbol{\beta})\), where \(\alpha_i\) is the offset (log of county population of the age group) and \(\textbf{X}_i\) are predictors (month, cause of death, age group). Derivations for the EM algorithm can be found in appendix B of the paper.

We coded this algorithm as follows:

First, we needed to get an initial estimate of our \(\beta\) coefficients in our Poisson regression model. We included age group, county of death, cause of death (either influenza+pneumonia OR chronic lower respiratory disease), and month of death as covariates.

To do this, we needed to perform an initial imputation to get a complete dataset to fit a model on. We decided to do this by making a crude estimation of the rate per 100,000 people \(\lambda_i\). To do this, we calculated a population weighted mean of the number of deaths across all ages and months. However, we only had observed populations at the county level, not for each specific age group included in the mortality data. So using census data which told us the approximate populations for specific age groups (for all of California), we were able to calculate approximate population sizes for each of the age categories included in the mortality data. See death_byage2 for reference. Then using the ratio of a given county’s population relative to the entire population of California, we were able to calculate approximations for the population size of each age group for each county in our mortality dataset. These served as the weights for our population weighted average of the rate of respiratory deaths in California.

population_age = read_xlsx("Population Categories.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
#head(population_age)

population_age = population_age[-(1:5),2]
population_age$...2 = as.numeric(population_age$...2)

#split under 5 category into < 1 and 1-4 years old 
less1 = floor(population_age$...2[1]*0.2)
onefour = floor(population_age$...2[1]*0.8)
death_byage = population_age$...2[-1]
death_byage = c(less1,onefour,death_byage)
death_byage = death_byage[1:19]

death_byage2 = death_byage[1:2]
idx = seq(from = 3, to = 17, by = 2)
for (i in idx){
  death_byage2 = c(death_byage2,(death_byage[i]+death_byage[i+1]))
}
death_byage2 = c(death_byage2,death_byage[19])


age_groups = unique(mortality$Age)
death_byage2 = data.frame(cbind(age_groups,death_byage2))
colnames(death_byage2) = c("Age_Group","Population_by_Age")
death_byage2$Population_by_Age = as.numeric(death_byage2$Population_by_Age)
#head(death_byage2)

#2010-2019 population data for CA 
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
counties = countycodes$value_represented #from EPA data file

weights = CApops[(2:59),2]
weights = weights/CApops[1,2]

groups = unique(mortality$Age)
step1 = 1
step2 = 1

for (i in counties){
  for (j in groups){
    idx = which(mortality$Age == j & mortality$County_of_Death == i)
    mortality$Population[idx] = ceiling(death_byage2$Population[step1]*weights[step2])
    step1 = step1+1
  }  
  step1 = 1
  step2 = step2+1
}

mortality$logpop = log(mortality$Population)

censored_idx = which(mortality$Total_Deaths == "<11")
censorTF = mortality$Total_Deaths == "<11"
mortality = cbind(mortality,censorTF)
#head(mortality)

GETTING INITIAL GUESS FOR LAMBDA: AVG DEATHS (PER 100K PEOPLE) PER MONTH FOR ONE COUNTY

uncensored_mortality = mortality %>% filter(censorTF == FALSE) %>% select(Total_Deaths,Population)
uncensored_mortality$Total_Deaths = as.numeric(uncensored_mortality$Total_Deaths)

theta = mean(uncensored_mortality$Total_Deaths*100000/uncensored_mortality$Population)

By using all the data, I obtained a crude initial guess for \(\lambda\) of about 1.08. Using this initial estimate \(\lambda\), we calculated the expected value for each \(Z_i\) to get an initial imputed dataset. This dataset will be used to estimate a Poisson regression model which will give us our initial value for our actual parameters of interest \(\beta\).

FUNCTION FOR IMPUTING CENSORED VALUE BASED ON EXPECTATION GIVEN LAMBDA

impute_small_values = function(lambda){
  x = 1:10
  p = dpois(x,lambda)
  
  value = sum(x*p)/sum(p)
  return(value)
}

INITIAL IMPUTATION:

mortality2 = mortality
mortality2$Total_Deaths[censored_idx] = 0.01
mortality2$Total_Deaths = as.numeric(mortality2$Total_Deaths)

for (i in censored_idx){
  lambda = theta*mortality2$Population[i] / 100000
  deaths = impute_small_values(lambda)
  
  mortality2$Total_Deaths[i] = floor(deaths)
}

INITIAL REGRESSION MODELS:

mortality2$Age = factor(mortality2$Age)
mortality2$Cause_of_Death = factor(mortality2$Cause_of_Death)
mortality2$Month = factor(mortality2$Month)

pois_reg = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality2)
# ZIP_reg = zeroinfl(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop) | 1, data = mortality2, dist = "poisson", link = "logit")

vec0 = coef(pois_reg)
# vec0 = coef(ZIP_reg)

summary(pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4252  -0.5207  -0.2124  -0.0650  14.5038  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -13.744433   0.087789 -156.562  < 2e-16
## Age15 - 24 years                        0.568750   0.095519    5.954 2.61e-09
## Age25 - 34 years                        1.081482   0.091812   11.779  < 2e-16
## Age35 - 44 years                        1.667882   0.090131   18.505  < 2e-16
## Age45 - 54 years                        2.381811   0.088827   26.814  < 2e-16
## Age5 - 14 years                         0.236541   0.099117    2.386    0.017
## Age55 - 64 years                        3.136398   0.088079   35.609  < 2e-16
## Age65 - 74 years                        4.183664   0.087714   47.697  < 2e-16
## Age75 - 84 years                        5.352522   0.087582   61.115  < 2e-16
## Age85 years and over                    6.499169   0.087521   74.258  < 2e-16
## AgeLess than 1 year                     0.965400   0.138658    6.962 3.34e-12
## Cause_of_DeathInfluenza and pneumonia  -0.751349   0.007035 -106.798  < 2e-16
## Month2                                 -0.253385   0.013643  -18.573  < 2e-16
## Month3                                 -0.251082   0.013634  -18.416  < 2e-16
## Month4                                 -0.450337   0.014455  -31.155  < 2e-16
## Month5                                 -0.522552   0.014781  -35.352  < 2e-16
## Month6                                 -0.646664   0.015382  -42.040  < 2e-16
## Month7                                 -0.661526   0.015458  -42.796  < 2e-16
## Month8                                 -0.709630   0.015707  -45.180  < 2e-16
## Month9                                 -0.756347   0.015957  -47.400  < 2e-16
## Month10                                -0.651645   0.015407  -42.294  < 2e-16
## Month11                                -0.617588   0.015237  -40.532  < 2e-16
## Month12                                -0.357779   0.014059  -25.449  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                      ***
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       *  
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 382636  on 91871  degrees of freedom
## Residual deviance:  74194  on 91849  degrees of freedom
## AIC: 126734
## 
## Number of Fisher Scoring iterations: 7
# summary(ZIP_reg)

Now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.

The main steps implemented in the chunk below are:

  1. Given a newly fitted Poisson regression model with parameter values \(\beta^{(t)}\), take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset

  2. Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)

  3. Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the Poisson regression model again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).

  4. Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.

Note: I experimented with a ZIP regression model as well but the log likelihood values at each iteration were generally higher for the Poisson regression model

mortality3 = mortality2
model = pois_reg

model_diff = 100
iter = 1
vec0 = coef(model)

while((model_diff > 0.01) & (iter < 10)){
  
  #impute data (should be between 1-10)
  fvs = fitted.values(model)
  
  for (i in censored_idx){
    deaths = impute_small_values(fvs[i])
    mortality3$Total_Deaths[i] = floor(deaths)
  }
  
  #fit model on "new" data
  model = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality3)
  vec1 = coef(model)
  
  model_diff = max(abs(vec1 - vec0))
  iter = iter+1
  vec0 = vec1
}

final_pois_reg = model
summary(final_pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9268  -0.3514  -0.1387  -0.0449  12.3014  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -14.351349   0.116534 -123.152  < 2e-16
## Age15 - 24 years                       -0.054190   0.135792   -0.399 0.689845
## Age25 - 34 years                        0.430115   0.127371    3.377 0.000733
## Age35 - 44 years                        1.210281   0.122028    9.918  < 2e-16
## Age45 - 54 years                        2.345599   0.118274   19.832  < 2e-16
## Age5 - 14 years                        -0.455007   0.145826   -3.120 0.001807
## Age55 - 64 years                        3.744079   0.116771   32.063  < 2e-16
## Age65 - 74 years                        5.001304   0.116457   42.946  < 2e-16
## Age75 - 84 years                        6.168765   0.116379   53.006  < 2e-16
## Age85 years and over                    7.285988   0.116346   62.623  < 2e-16
## AgeLess than 1 year                     1.536529   0.158560    9.691  < 2e-16
## Cause_of_DeathInfluenza and pneumonia  -0.719013   0.006477 -111.013  < 2e-16
## Month2                                 -0.245579   0.012764  -19.240  < 2e-16
## Month3                                 -0.234760   0.012726  -18.448  < 2e-16
## Month4                                 -0.416647   0.013416  -31.057  < 2e-16
## Month5                                 -0.488946   0.013715  -35.649  < 2e-16
## Month6                                 -0.603208   0.014221  -42.418  < 2e-16
## Month7                                 -0.631446   0.014352  -43.998  < 2e-16
## Month8                                 -0.672621   0.014547  -46.237  < 2e-16
## Month9                                 -0.708569   0.014723  -48.128  < 2e-16
## Month10                                -0.618620   0.014292  -43.285  < 2e-16
## Month11                                -0.580590   0.014118  -41.125  < 2e-16
## Month12                                -0.337932   0.013106  -25.784  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                         
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       ** 
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 455249  on 91871  degrees of freedom
## Residual deviance:  38553  on 91849  degrees of freedom
## AIC: 97178
## 
## Number of Fisher Scoring iterations: 8
logLik(final_pois_reg)
## 'log Lik.' -48566.18 (df=23)

Alternatively, we can take the first and second derivative and apply a Newton-Raphson procedure if we want to solve for \(\beta\) numerically. To solve for the fixed point solutions for \(\beta\) numerically, we would first take the derivative of \(Q()\) wrt each \(\beta_j\) which gives us:

\(\frac{dQ}{d\beta_j} = \sum_{i \in D \backslash W}^{n_{obs}}[y_i - exp(\alpha_i + x_i \beta)]x_{ij} + \sum_{i \in W}^N [\tilde y_i - exp(\alpha_i + x_i \beta)]x_{ij}\) where \(\alpha_i\) represents the offset associated with each observation \(i\) and \(\tilde y_i\) represents the imputed values corresponding to censored observation \(Z_i\)

Rewriting this in vector form (since we have 3 covariates, which are all categorical) we obtain the following gradient:

\(f'(\beta) = \frac{dQ}{d \overrightarrow \beta} = X^T [\overrightarrow y - exp(\overrightarrow \alpha + X \beta)]\) where first term is a \(p\times n\) matrix and second term is a \(n\times 1\) vector

Then, we take another derivative to get the Hessian:

\(f''_{jj'}(\beta) = -\sum_{i=1}^N exp(\alpha_i + x_i \beta) X_{ij} X_{ij'} = -X^T diag(exp(\overrightarrow \alpha + X \beta)) X\)

Finally, we solve for the next value for our \(\beta\)s using these two values with the following equation:

\(\beta^{(b)} = \beta^{(b-1)} - [(f''(\beta^{(b-1)}))^{-1} f'(\beta^{(b-1)})]\)

Now we need to code up a Newton Raphson function and initialize it:

Y = mortality2$Total_Deaths

#X is design matrix w col of 1s then each level of each categorical predictors except their baselines 
intercept = rep(1,length(Y))

#Age categories 
Age1524 = as.numeric(mortality2$Age == "15 - 24 years")
Age2534 = as.numeric(mortality2$Age == "25 - 34 years")
Age3544 = as.numeric(mortality2$Age == "35 - 44 years")
Age4554 = as.numeric(mortality2$Age == "45 - 54 years")
Age514 = as.numeric(mortality2$Age == "5 - 14 years")
Age5564 = as.numeric(mortality2$Age == "55 - 64 years")
Age6574 = as.numeric(mortality2$Age == "65 - 74 years")
Age7584 = as.numeric(mortality2$Age == "75 - 84 years")
Age85 = as.numeric(mortality2$Age == "85 years and over")
Age1 = as.numeric(mortality2$Age == "Less than 1 year")

#Cause of death categories 
Cause2 = as.numeric(mortality2$Cause_of_Death == "Influenza and pneumonia")

#Month categories 
Month2 = as.numeric(mortality2$Month == 2)
Month3 = as.numeric(mortality2$Month == 3)
Month4 = as.numeric(mortality2$Month == 4)
Month5 = as.numeric(mortality2$Month == 5)
Month6 = as.numeric(mortality2$Month == 6)
Month7 = as.numeric(mortality2$Month == 7)
Month8 = as.numeric(mortality2$Month == 8)
Month9 = as.numeric(mortality2$Month == 9)
Month10 = as.numeric(mortality2$Month == 10)
Month11 = as.numeric(mortality2$Month == 11)
Month12 = as.numeric(mortality2$Month == 12)

X = cbind(intercept,Age1524,Age2534,Age3544,Age4554,Age514,Age5564,Age6574,Age7584,Age85,Age1,
          Cause2,Month2,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,
          Month11,Month12)
# dim(X)

offset_vec = offset(mortality2$logpop)
offset_vec = matrix(offset_vec,ncol=1)

#Initial guesses for beta
B = coef(pois_reg)
B = matrix(B,ncol=1)

#Define first derivative of Q function
f_gradient = function(Y,X,B){
  value = t(X) %*% (Y - exp(X %*% B + offset_vec))
  return(value)
}

# f_gradient(Y,X,B)

#Define second derivative of Q function
f_hessian = function(Y,X,B){
  middle = as.numeric(exp(X %*% B + offset_vec))
  X2 = X
  
  for (i in 1:length(middle)){
    X2[i,] = X[i,] * middle[i]
  }
  
  value = -t(X) %*% X2
  
  return(value)
}

# dim(f_hessian(Y,X,B))

#Define Newton Raphson function and compute initial beta coefficient estimates 
Newton_Raphson = function(Y,X,x0,tol = 0.001,eps = 0.01,max_iter = 100){
  for (i in 1:max_iter){
    g = f_gradient(Y,X,x0)
    h = f_hessian(Y,X,x0)
    value = abs(det(h))
    
    if (value < eps){
      break
    }
    
    x1 = x0 - (solve(h) %*% g)
    # x1 = x0 - (solve(h) %*% g * (0.01 * 0.999^i)) #gradient descent is too large at each iteration so need to slow it down
    
    if (max(abs(x1-x0)) <= tol){
      return(x1)
    }
    
    x0 = x1
  }
  
  return(x0)
}

#Initial beta coefficient estimates 
new_coefs = Newton_Raphson(Y,X,B)

Similar to above, now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.

The main steps implemented in the chunk below are:

  1. Given newly estimated parameter values \(\beta^{(t)}\) from the Newton-Raphson procedure above, take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset

  2. Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)

  3. Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the beta coefficients with Newton-Raphson again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).

  4. Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.

while((model_diff > 0.01) & (iter < 10)){
  
  #impute data (should be between 1-10)
  fvs_NR = exp((X %*% new_coefs) + offset_vec)
  
  for (i in censored_idx){
    deaths = impute_small_values(fvs_NR[i])
    Y[i] = floor(deaths)
  }
  
  new_coefs2 = Newton_Raphson(Y,X,new_coefs)
  
  model_diff = max(abs(new_coefs2 - new_coefs))
  new_coefs = new_coefs2
  iter = iter+1
  vec0 = vec1
}

new_coefs
##                  [,1]
## intercept -13.7444329
## Age1524     0.5687496
## Age2534     1.0814825
## Age3544     1.6678819
## Age4554     2.3818112
## Age514      0.2365414
## Age5564     3.1363978
## Age6574     4.1836642
## Age7584     5.3525223
## Age85       6.4991688
## Age1        0.9653963
## Cause2     -0.7513495
## Month2     -0.2533848
## Month3     -0.2510821
## Month4     -0.4503366
## Month5     -0.5225520
## Month6     -0.6466641
## Month7     -0.6615256
## Month8     -0.7096304
## Month9     -0.7563473
## Month10    -0.6516455
## Month11    -0.6175879
## Month12    -0.3577794
summary(final_pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9268  -0.3514  -0.1387  -0.0449  12.3014  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -14.351349   0.116534 -123.152  < 2e-16
## Age15 - 24 years                       -0.054190   0.135792   -0.399 0.689845
## Age25 - 34 years                        0.430115   0.127371    3.377 0.000733
## Age35 - 44 years                        1.210281   0.122028    9.918  < 2e-16
## Age45 - 54 years                        2.345599   0.118274   19.832  < 2e-16
## Age5 - 14 years                        -0.455007   0.145826   -3.120 0.001807
## Age55 - 64 years                        3.744079   0.116771   32.063  < 2e-16
## Age65 - 74 years                        5.001304   0.116457   42.946  < 2e-16
## Age75 - 84 years                        6.168765   0.116379   53.006  < 2e-16
## Age85 years and over                    7.285988   0.116346   62.623  < 2e-16
## AgeLess than 1 year                     1.536529   0.158560    9.691  < 2e-16
## Cause_of_DeathInfluenza and pneumonia  -0.719013   0.006477 -111.013  < 2e-16
## Month2                                 -0.245579   0.012764  -19.240  < 2e-16
## Month3                                 -0.234760   0.012726  -18.448  < 2e-16
## Month4                                 -0.416647   0.013416  -31.057  < 2e-16
## Month5                                 -0.488946   0.013715  -35.649  < 2e-16
## Month6                                 -0.603208   0.014221  -42.418  < 2e-16
## Month7                                 -0.631446   0.014352  -43.998  < 2e-16
## Month8                                 -0.672621   0.014547  -46.237  < 2e-16
## Month9                                 -0.708569   0.014723  -48.128  < 2e-16
## Month10                                -0.618620   0.014292  -43.285  < 2e-16
## Month11                                -0.580590   0.014118  -41.125  < 2e-16
## Month12                                -0.337932   0.013106  -25.784  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                         
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       ** 
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 455249  on 91871  degrees of freedom
## Residual deviance:  38553  on 91849  degrees of freedom
## AIC: 97178
## 
## Number of Fisher Scoring iterations: 8

NEWTON RAPHSON APPROACH DOES NOT WORK WELL BC AT EACH ITERATION VALUES ARE CHANGING BY TOO MUCH, LEADS TO HESSIAN MATRIX BEING UNINVERTIBLE

Now that we have imputed the censored “< 11” values in the Cal-ViDa dataset, we will now aggregate the data to get total number of respiratory related deaths for each county for every month between 2014-2019.

Empirical 5 number summaries for each variable of interest

print("Summary for respiratory related mortality")
## [1] "Summary for respiratory related mortality"
summary(mortality3$Total_Deaths) #summary of deaths per month, for all counties i.e. all of CA, age groups, and months 2014-2019
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.178   0.000 209.000
print("Summary for EPA data")
## [1] "Summary for EPA data"
for (i in pollutants$parametercodes.code){
  data = final_EPA_data %>% filter(Pollutant == i) 
  print(summary(data$Value)) #summary of values for each pollutant, for all counties i.e. all of CA, all months (2014-2019)
  AQI = data$AQI
}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0120  0.0140  0.0160  0.0184  0.0190  0.0655 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2292  0.3000  0.3458  0.3719  0.4333  0.6500 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2636  0.4864  0.6545  0.6448  0.7773  1.1318 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.402   4.939   7.139   8.351  11.071  21.668 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.008118 0.020000 0.026412 0.025109 0.028948 0.037000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   13.00   18.00   20.09   25.00   70.50 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.200   6.700   7.800   8.546   9.600  23.350
summary(AQI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   33.00   36.00   38.34   40.50   75.00
print("Summary for SDI data")
## [1] "Summary for SDI data"
summary(soa.data$Score) #summary of SDI score, for all counties i.e. all of CA, all years 2010-2019
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.38   93.45  106.95  112.53  131.69  183.67

Reformatting the data set (mainly aggregating)

First, we wanted to combine the number of deaths from the two different causes into a total number of respiratory related deaths for each age group. Then, we combined the total number of deaths for each age group for a given month and county in a singular total for that month and county.

#Aggregating by  cause of death
data = mortality3

data1 = data %>% filter(Cause_of_Death == "Chronic lower respiratory diseases")
data2 = data %>% filter(Cause_of_Death == "Influenza and pneumonia")

newdeaths = data1$Total_Deaths + data2$Total_Deaths
data1$Total_Deaths = newdeaths

respmortality = data1[,-5]


#Creates total deaths by adding deaths of all age groups together 
agg.respmortality = respmortality[1,]
agg.respmortality$Age = as.character(agg.respmortality$Age)
rows2 = seq(1,nrow(respmortality),11)

for (i in rows2){
  agg.respmortality[i,] = respmortality[i,]
  agg.respmortality$Total_Deaths[i] = sum(respmortality$Total_Deaths[i:(i+10)])
  agg.respmortality$Age[i] = "Everyone"
}

agg.respmortality = na.omit(agg.respmortality)
rownames(agg.respmortality) = NULL
total.respmortality = agg.respmortality[,-c(6:9)]

Reformat dataset into time series format (rows are counties, columns are months)

Total deaths:

months = unique(total.respmortality$Month_of_Death)
years = sort(unique(total.respmortality$Year_of_Death))
counties = unique(total.respmortality$County_of_Death)
x = 0

total.mortality.ts = matrix(1,nrow = 58, ncol = 72)

for (k in counties){
  county.ts = c()
  x = x+1
  
  for (i in years){
    for (j in months){
      deaths = total.respmortality %>% filter(County_of_Death == k) %>%  filter(Year_of_Death == i) %>% filter(Month_of_Death == j) %>% select(Total_Deaths) %>% as.numeric()
      county.ts = c(county.ts,deaths)
    }
  } 
  
  total.mortality.ts[x,] = county.ts
}

#Label time series data 
total.mortality.ts = as.data.frame(total.mortality.ts)

dates = c()
x=1
for (i in years){
  for (j in months){
    dates[x] = sprintf("%1.0f/%1.0f",j,i)
    x = x+1
  }
}

colnames(total.mortality.ts) = dates
ID = c(1:58)
total.mortality.ts = cbind(ID,counties,total.mortality.ts)

total.mortality.ts = left_join(clusterlabels,total.mortality.ts,by = "counties")
head(total.mortality.ts)
##    counties Cluster ID 1/2014 2/2014 3/2014 4/2014 5/2014 6/2014 7/2014 8/2014
## 1   Alameda       1  1     84     61     57     57     55     44     44     44
## 2    Alpine       1  2      0      0      0      0      0      0      0      0
## 3    Amador       1  3      2      3      2      2      2      1      3      1
## 4     Butte       2  4     15     10     11      8      5      6      7      6
## 5 Calaveras       1  5      3      4      2      3      0      0      1      0
## 6    Colusa       2  6      0      1      1      0      1      0      0      0
##   9/2014 10/2014 11/2014 12/2014 1/2015 2/2015 3/2015 4/2015 5/2015 6/2015
## 1     42      51      50      62    101     72     63     61     63     46
## 2      0       0       0       0      0      0      0      0      0      0
## 3      1       3       2       1      3      3      3      5      4      4
## 4      7       5       7       9     13      8     10     19      6      8
## 5      1       2       1       1      2      0      2      2      1      2
## 6      1       0       0       0      1      2      0      1      0      0
##   7/2015 8/2015 9/2015 10/2015 11/2015 12/2015 1/2016 2/2016 3/2016 4/2016
## 1     44     40     36      41      41      64     78     56     62     55
## 2      0      0      0       0       0       0      0      0      0      0
## 3      2      2      2       2       2       4      3      3      2      2
## 4      6     10      3       7       8       6     13      6      8      7
## 5      2      1      2       0       1       4      3      2      4      2
## 6      0      0      1       1       1       2      1      0      1      0
##   5/2016 6/2016 7/2016 8/2016 9/2016 10/2016 11/2016 12/2016 1/2017 2/2017
## 1     39     44     47     43     37      53      48      60    114     66
## 2      0      0      0      0      0       1       0       0      0      0
## 3      0      0      1      1      2       0       2       2      4      2
## 4     18      4      4      6      7       6       8       8     11     11
## 5      3      2      3      2      1       0       0       3      2      1
## 6      0      0      0      0      0       0       0       2      0      1
##   3/2017 4/2017 5/2017 6/2017 7/2017 8/2017 9/2017 10/2017 11/2017 12/2017
## 1     62     59     45     45     48     37     41      40      44      62
## 2      0      0      0      0      0      0      0       0       0       0
## 3      1      3      3      3      4      2      2       1       1       2
## 4     11     10      8     10      6      8      5      10       7       9
## 5      2      4      2      2      1      2      1       0       0       3
## 6      0      0      0      1      0      1      0       0       0       1
##   1/2018 2/2018 3/2018 4/2018 5/2018 6/2018 7/2018 8/2018 9/2018 10/2018
## 1    114     58     62     52     50     46     49     39     39      49
## 2      0      0      0      0      0      0      0      0      0       0
## 3      6      2      2      0      2      3      1      1      2       3
## 4     13     10     10      9      8      7      6      8      7       8
## 5      3      4      3      2      2      2      1      1      5       3
## 6      0      0      1      1      2      0      0      0      1       0
##   11/2018 12/2018 1/2019 2/2019 3/2019 4/2019 5/2019 6/2019 7/2019 8/2019
## 1      41      49     60     61     56     60     38     43     48     45
## 2       1       0      0      0      0      0      0      0      0      0
## 3       3       2      4      2      2      4      0      1      3      0
## 4       8       6     10      8      8      7      7      8      6      6
## 5       2       1      3      3      2      1      1      4      0      2
## 6       0       0      2      0      0      1      1      1      0      1
##   9/2019 10/2019 11/2019 12/2019
## 1     38      44      47      68
## 2      0       0       0       0
## 3      2       1       2       3
## 4      5       5       8       7
## 5      3       1       1       0
## 6      1       0       1       2

Exploring the aggregated data

HOW MANY 0s DOES EACH COUNTY HAVE?

numzeros_total = c()
for (i in 1:58){
  numzeros_total[i] = length(which(total.mortality.ts[i,3:74] == 0))
}

numzeros_total
##  [1]  0 69  6  0 10 42  0  8  0  0 32  0  0 20  0  1  0 31  0  0  0 31  1  0 25
## [26] 62  0  0  1  0  0 25  0  0 13  0  0  0  0  0  0  0  0  0  0 61  2  0  0  0
## [51]  3  2 38  0  3  0  0  0
propzeros_total = numzeros_total/72

length(which(propzeros_total > 0.85))
## [1] 2
countycodes$value_represented[which(propzeros_total > 0.75)]
## [1] "Alpine" "Mono"   "Sierra"
hist(propzeros_total,breaks = 20 ,xlab = "Proportion of months with 0 deaths",main = "Do some counties have more strings of 0s than others?")

One aspect of the data that we wanted to examine before proceeding with our analysis was the frequency in which there were 0 deaths in a given county for a month. This would inform us about whether a standard Poisson model or a zero inflated Poisson model would be more appropriate. What I did above was first calculate the proportion of months (out of 72) that had 0 deaths observed for each county. Then identified which counties had a proportion of 0s greater than 75%, 85%, etc. Then, I made a histogram which shows there are only a couple of counties (which have very small populations) that had a high frequency of 0s. The aggregation performed in previous steps addressed the zero inflation it appears.

AGGREGATING MORTALITY DATA INTO CLUSTERS AS OPPOSED TO EACH COUNTY (ALSO AGGREGATED TO MORTALITY RATE PER CLUSTER)

Again, we want the number of respiratory related deaths at the cluster level, not the county level. So we once again aggregate the observations for each county in a given cluster. First, we simply add all the observations in cluster together to get a total number of respiratory related deaths for the months of 2014-2019 for each cluster. Then, we also calculated a mortality rate (per 100k people) for each cluster. This was done by taking the total number of deaths for a given cluster and dividing it by the total population of that cluster times 100,000 i.e. (deaths\(*\frac{100000}{clusterpop}\)). This second dataset will be used for our Gaussian process regression model which needs to be fit on a continuous response variable.

#County populations by year pulled from SoA data
countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))

cluster_mortality_total = matrix(NA,nrow = 72,ncol = num_clus)

cluster_mortality_rate = matrix(NA,nrow = 72,ncol = num_clus)

for (i in 1:num_clus){
  cluster = total.mortality.ts %>% filter(Cluster == i)
  year = 2014
  
  for(j in 1:72){
    col = cluster[,j+3]
    
    #Sum of deaths across counties in a cluster
    cluster_mortality_total[j,i] = sum(na.omit(col))
    
    #Rate of deaths (per 100,000) across counties in a cluster
    pops = countypops %>% filter(Year == year,Cluster == i) %>% select(Total_Pop)
    cluster.pop = sum(pops)
    cluster_mortality_rate[j,i] = (sum(na.omit(col))/cluster.pop)*100000
    
    if ((j>12) & (j<25)){
      year = 2015
    }
    
    else if ((j>24) & (j<37)){
      year = 2016
    }
    
    else if ((j>36) & (j<49)){
      year = 2017
    }
    
    else if ((j>48) & (j<61)){
      year = 2018
    }
    
    else if ((j>60) & (j<73)){
      year = 2019
    }
    
    else{
      year = 2014
    }
  }
}

#Time series of total deaths for each cluster
colnames(cluster_mortality_total) = c("Cluster 1","Cluster 2")
rownames(cluster_mortality_total) = colnames(total.mortality.ts[4:75])
cluster_mortality_total = data.frame(cluster_mortality_total)

head(cluster_mortality_total)
##        Cluster.1 Cluster.2
## 1/2014      2092       150
## 2/2014      1626        96
## 3/2014      1567        94
## 4/2014      1341        93
## 5/2014      1301        93
## 6/2014      1180        71
#Time series of rate of deaths (per 100,000) for each cluster
colnames(cluster_mortality_rate) = c("Cluster 1","Cluster 2")
rownames(cluster_mortality_rate) = colnames(total.mortality.ts[4:75])
cluster_mortality_rate = data.frame(cluster_mortality_rate)

head(cluster_mortality_rate)
##        Cluster.1 Cluster.2
## 1/2014  5.852932  6.078488
## 2/2014  4.549172  3.890232
## 3/2014  4.384103  3.809186
## 4/2014  3.751808  3.768662
## 5/2014  3.639897  3.768662
## 6/2014  3.301367  2.877151

MAKE A TIME SERIES FOR EACH CLUSTER:

plot(ts(cluster_mortality_total$Cluster.1),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 1")

plot(ts(cluster_mortality_total$Cluster.2),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 2")

Fitting INLA models: spatial GLMM, Besag-York-Mollie (BYM) model, LGCPs

Now that all of the data from the SoA (used for SKATER and HUGE to get graph filter H), EPA (used to get gram matrix K), and Cal-ViDa (response) is downloaded, cleaned, and well formatted, we can now fit our kernel graph regression model as well as a few reference models, which we will compare against each other. For now, I have implemented a training-test data fitting approach to evaluating model performance.

First, I created an in sample dataset (inla_insample_data) which has variables ID (which represents the cluster label), ID2 (which is basically an index label), response (cluster mortality), time (time index label 1-66), and months (month label 1-12). Then, I create an out of sample dataset (inla_outsample_data) in which I decided to hold out the last 6 months of the data (67-72 or July-Dec 2019) so I replaced those response values with NAs. This is how you get INLA to make predictions/forecasts because it does so based on the posterior predictive distribution.

cluster_mortality_total_red = cluster_mortality_total[13:72,]
cluster_mortality_rate_red = cluster_mortality_rate[13:72,]

response = t(cluster_mortality_total_red)
response = as.vector(response)
response = ceiling(response)

response2 = t(cluster_mortality_rate_red)
response2 = as.vector(response2)

id = rep(c(1:num_clus),60)
id2 = 1:(num_clus*60)
time = rep(c(1:60),each = num_clus)

inla_full_data = data.frame(id,id2,response,time)

inla_full_data2 = data.frame(id,id2,response2,time)

months = rep(c(1:12),each = num_clus)
months = rep(months,5)
inla_full_data = cbind(inla_full_data,months)

#Experimented with defining each of these as factors
# inla_full_data$id = factor(inla_full_data$id) 
# inla_full_data$id2 = factor(inla_full_data$id2)
# inla_full_data$time = factor(inla_full_data$time)
inla_full_data$months = factor(inla_full_data$months)

#Add multiple intercept columns, one for each cluster 
Intercept1 = rep(c(1,NA),60)
Intercept2 = rep(c(NA,1),60)

inla_full_data = cbind(inla_full_data,Intercept1,Intercept2)

# response = replace_na(response,0.01)
inla_gp_data = inla_full_data2

year = rep(2015:2019,each = 12)
inla_gp_data = cbind(inla_gp_data,year)

###Split into in sample and out of sample dataset
inla_outsample_data = inla_full_data

#Omit values for months 61-66 (out of sample dataset)
omit_idx = which(inla_outsample_data$time > 54)
inla_outsample_data$response[omit_idx] = NA
inla_insample_data = inla_full_data[-omit_idx,]

omit_idx = which(inla_gp_data$time > 54)
inla_gp_data$response2[omit_idx] = NA

IN SAMPLE FITTING ANALYSIS

Fit a simple Poisson GLMM for our mortality data (Reference model 1)

We wanted to compare the performance of our proposed model with a few reference models. The first one is a Poisson generalized linear mixed model. This model assumes the observed data follows a Poisson distribution and the hyperparameter \(\lambda_i\) can be modeled using a mixed effects model with a log link.

In other words,

\(Y_{c,t} \sim Pois(\Lambda_{c,t})\) for \(c=1,2\) and \(t=1,...,54\) where \(\Lambda_{c,t} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_c)\)

where the random effect \(\textbf{F} | \tau \sim \mathcal{GP}(\textbf{0},\tau \boldsymbol{\Sigma})\)

We wanted the first reference model to be simple, so we assumed that the random effects \(\textbf{F}_c\) are iid. This means that \(\Sigma\) is simply a diagonal matrix of scaling factors. The hyperparameter \(log(\tau)\) is by default assigned a \(log \; \Gamma(1,0.00005)\) prior.

#Write a function to fit our poisson glmm in INLA 
ref_model1 = function(dataset,a_prior = 1,b_prior = 5e-05,link=1){
  ###Fit INLA model 
  prec_prior <- list(prec = list(prior = "loggamma", param = c(a_prior,b_prior)))
  ref_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + f(id,model = "iid", hyper = prec_prior) #could use id or id2 
  model = inla(formula = ref_formula1,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model1_results = list(
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model1_results)
}

#Run model 
ref_model1_fit = ref_model1(dataset = inla_insample_data,a_prior = 1,b_prior = 0.00001)

#Extract DIC and WAIC
ref_model1_DIC = ref_model1_fit$model_DIC
ref_model1_WAIC = ref_model1_fit$model_WAIC

#Get summaries of parameter estimates
ref_model1_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2973863 8.451485  -15.27728 1.2973863   17.87205 1.2973863
## months2    1.0552637 8.451486  -15.51941 1.0552637   17.62993 1.0552637
## months3    1.0746046 8.451486  -15.50006 1.0746046   17.64927 1.0746046
## months4    0.8864129 8.451487  -15.68826 0.8864129   17.46108 0.8864129
## months5    0.8057920 8.451487  -15.76888 0.8057920   17.38046 0.8057920
## months6    0.6903558 8.451488  -15.88432 0.6903558   17.26503 0.6903558
## months7    0.6615820 8.451491  -15.91310 0.6615820   17.23626 0.6615820
## months8    0.6278468 8.451491  -15.94683 0.6278468   17.20253 0.6278468
## months9    0.5942249 8.451491  -15.98045 0.5942249   17.16890 0.5942249
## months10   0.6657951 8.451491  -15.90888 0.6657951   17.24047 0.6657951
## months11   0.7096635 8.451490  -15.86501 0.7096635   17.28434 0.7096635
## months12   0.9748683 8.451488  -15.59980 0.9748683   17.54954 0.9748683
## Intercept1 6.4028461 8.451483  -10.17182 6.4028461   22.97751 6.4028461
## Intercept2 3.6409497 8.451492  -12.93373 3.6409497   20.21563 3.6409497
##                     kld
## months1    5.527855e-11
## months2    5.527834e-11
## months3    5.527834e-11
## months4    5.527833e-11
## months5    5.527812e-11
## months6    5.527851e-11
## months7    5.527848e-11
## months8    5.527847e-11
## months9    5.527847e-11
## months10   5.527828e-11
## months11   5.527848e-11
## months12   5.527831e-11
## Intercept1 5.527838e-11
## Intercept2 5.527825e-11
ref_model1_fit$bri_hyperpar_summary
##                  mean          sd      q0.025        q0.5     q0.975
## SD for id 0.005177213 0.004471645 0.001635036 0.003776738 0.01824025
##                  mode
## SD for id 0.002548752
ref_model1_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.659719   2.872733   2.928835   2.426410   2.238469   1.994425   1.937856 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.873572   1.811626   1.946037   2.033307   2.650818 603.560401  38.128030
#Show plots
ref_model1_fit$param_plot

ref_model1_fit$hyperparam_plot

Note: SD for ID: standard deviation for the means (avg intensities) corresponding to the 8 different clusters was 1.84

Plot of posterior predictive estimates with credible interval bands OVERLAID on response:

#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_insample_plot = function(num_plots = num_clus, ref_data = inla_insample_data, pred_data){
  for (i in 1:num_plots){
  df = ref_data %>% filter(id == i) %>% select(response)
  preds = pred_data %>% filter(id == i) 
  df = cbind(df,preds)
  
  # title = sprintf("Posterior Predictive Fits for Cluster %s",i)
  title = sprintf("Cluster %s",i)
  
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + ggtitle(title)
  print(post_pred_plot)
  }
}
#Plot ref_model1 pp plot  
pp_insample_plot(pred_data = ref_model1_fit$fitted_values)

Fitting a Besag-York-Mollie model (Reference model 2)

For our second reference model, we decided to fit a Besag-York-Mollie model, which is a log-normal Poisson model with an intrinsic conditional autoregressive component to capture spatial autocorrelations i.e. a Besag model, plus a standard random effects term which is included to capture non-spatial heterogeneity. Obviously, this model is less naive than reference model 1 because it does not assume iid random effects.

The BYM model can be written as,

\(Y_{c,t} \sim Pois(\Lambda_{c,t})\) for \(c=1,2\) and \(t=1,...,54\) where \(\Lambda_{c,t} | \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \phi_c + \textbf{F}_c)\)

where \(p(\boldsymbol{\phi}) \propto \exp(-\frac{1}{2} \sum_{c_1 \sim c_2} (\phi_{c_1} - \phi_{c_2})^2)\) and \(\textbf{F} | \; \textbf{S}, \tau \sim \mathcal{GP}(\textbf{0},\tau \boldsymbol{\Sigma})\).

Note: it is more commonly known that ICAR components are conditionally normally distributed.

As one can see below, the summary outputs indicate that this model is very similar to the Poisson GLMM (reference model 1). The intercept and SD for the random effect component are estimated to almost the exact same as those estimated by the Poisson GLMM, indicating that including the spatial ICAR component is seemingly not very impactful.

#Write a function to fit our BYM model in INLA 
ref_model2 = function(dataset,a_prec_prior = 1,b_prec_prior = 5e-04,a_phi_prior = 1,b_phi_prior = 5e-04,link=1){
  ###Fit INLA model 
  bym_prior <- list(
  prec.unstruct = list(
    prior = "loggamma",
    param = c(a_prec_prior,b_prec_prior)),
  prec.spatial = list(
    prior = "loggamma",
    param = c(a_phi_prior,b_phi_prior))
  )
  ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + 
    f(id, model = "bym", graph = huge.est, hyper = bym_prior) #ID2 in formula results in error 
  model = inla(formula = ref_formula2,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model2_results = list(
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model2_results)
}

#Fit ref_model2
ref_model2_fit = ref_model2(dataset = inla_insample_data,a_prec_prior = 1,b_prec_prior = 1e-5,
                            a_phi_prior = 1,b_phi_prior = 1)

#Extract DIC and WAIC
ref_model2_DIC = ref_model2_fit$model_DIC
ref_model2_WAIC = ref_model2_fit$model_WAIC

#Get summaries of parameter estimates
ref_model2_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2973863 8.451485  -15.27728 1.2973863   17.87205 1.2973863
## months2    1.0552637 8.451486  -15.51941 1.0552637   17.62993 1.0552637
## months3    1.0746046 8.451486  -15.50006 1.0746046   17.64927 1.0746046
## months4    0.8864129 8.451487  -15.68826 0.8864129   17.46108 0.8864129
## months5    0.8057920 8.451487  -15.76888 0.8057920   17.38046 0.8057920
## months6    0.6903558 8.451488  -15.88432 0.6903558   17.26503 0.6903558
## months7    0.6615820 8.451491  -15.91310 0.6615820   17.23626 0.6615820
## months8    0.6278468 8.451491  -15.94683 0.6278468   17.20253 0.6278468
## months9    0.5942249 8.451491  -15.98045 0.5942249   17.16890 0.5942249
## months10   0.6657951 8.451491  -15.90888 0.6657951   17.24047 0.6657951
## months11   0.7096635 8.451490  -15.86501 0.7096635   17.28434 0.7096635
## months12   0.9748683 8.451488  -15.59980 0.9748683   17.54954 0.9748683
## Intercept1 6.4000903 8.510644  -10.29332 6.4002670   23.09245 6.4005594
## Intercept2 3.6437055 8.510653  -13.04867 3.6435288   20.33713 3.6432365
##                     kld
## months1    5.527855e-11
## months2    5.527834e-11
## months3    5.527834e-11
## months4    5.527813e-11
## months5    5.527832e-11
## months6    5.527851e-11
## months7    5.527848e-11
## months8    5.527827e-11
## months9    5.527847e-11
## months10   5.527828e-11
## months11   5.527828e-11
## months12   5.527831e-11
## Intercept1 4.017159e-13
## Intercept2 4.019112e-13
ref_model2_fit$bri_hyperpar_summary
##                                      mean         sd      q0.025       q0.5
## SD for id (idd component)     0.004718937 0.00287443 0.001729256 0.00388754
## SD for id (spatial component) 1.490534842 0.90747828 0.546594376 1.22806346
##                                   q0.975        mode
## SD for id (idd component)     0.01248449 0.002815953
## SD for id (spatial component) 3.94214878 0.890258229
ref_model2_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.659719   2.872733   2.928835   2.426410   2.238469   1.994425   1.937856 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.873572   1.811626   1.946037   2.033307   2.650818 601.899412  38.233247
#Show plots
ref_model2_fit$param_plot

ref_model2_fit$hyperparam_plot

pp_insample_plot(pred_data = ref_model2_fit$fitted_values)

Fitting kernel graph regression models

KGR model with time series kernel x graph filter (Proposed model 2)

Finally, we fit our proposed model which we call a kernel graph regression model. It also takes the form of a latent Gaussian model as shown below:

\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)

where the graph signal \(\textbf{F} | \textbf{S}, \rho_{rbf}, \rho_{p}, \sigma^2_{EPA} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{EPA} \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).

The key difference here is that the covariance matrix of this GP is specified by the kronecker product of \(K\), which is the time kernel gram matrix calculated from the EPA air quality data, and \(H\), which is the graph filter which is calculated from the adjacency matrix estimated by glasso using the HUGE package. This matrix is completely known and can be directly plugged into INLA as the covariance matrix of our underlying GP using the “generic0” specification as shown below:

Calculating gram matrix K from EPA data

Using the EPA air quality data, we can calculate the gram matrix K which will characterize the dependence structure of air quality (across 7 different pollutants and AQI) over time. This is done by calculating the squared difference between all of the observations at two different time points e.g. 16 observations for Jan 2014 compared with the 64 observations for Feb 2014. For the in sample analysis, the resulting matrix is 54x54 because we are holding out the last 6 months of observations.

EPA_kernel = function(EPA_data = final_EPA_agg_data,time_span,rho_rbf,rho_periodic,sigma2){
  K_EPA = matrix(0,nrow=time_span,ncol=time_span)
  i = 1
  j = 1
  
  for(t1 in 1:time_span){
    for (t2 in 1:time_span){
      A = EPA_data %>% filter(Time == t1)
      B = EPA_data %>% filter(Time == t2)
      AQIa = unique(A$AQI)
      AQIb = unique(B$AQI)
      
      ABtest = c((A$Value-B$Value)^2,(AQIa-AQIb)^2) #2 clusters * 8 measurements 
      # K_EPA[i,j] = exp(-sum(ABtest) / (2*rho_rbf)) * sigma2
      
      # K_EPA[i,j] = exp(- (sum(ABtest)) ###square this sum or remove it??? 
      #                / (2*rho_rbf)) * exp(- (2*sin(sum(abs(ABtest))*pi/12)^2)
      #                / (rho_periodic)) * sigma2
      
      K_EPA[i,j] = exp(- (mean(ABtest)) ###mean or sum??? 
               / (2*rho_rbf)) * exp(- (2*sin(sum(abs(ABtest))*pi/12)^2)
               / (rho_periodic)) * sigma2
      
      j = j+1
    }
    
    j = 1
    i = i+1
  }
  
  return(K_EPA)
}

Ensuring precision matrix is not computationally singular, so we jitter eigenvalues, using reciprocal condition number as constraint

desingularize = function(covmatrix,threshold = 1e-2,increment = 0.01){
  
  tracker = 0
  
  while (rcond(covmatrix) <= threshold){
    #Perform spectral decomposition
    ev = eigen(covmatrix)
    L = ev$values
    V = ev$vectors
    
    # #Add a little noise to eigenvalues to bring away from 0
    L = L + increment
    
    # #Add 0.01 to eigenvalues in bottom five percentile to bring away from 0
    # cutoff = quantile(abs(L),0.05)
    # L[which(abs(L) < cutoff)] = L[which(abs(L) < cutoff)] + 0.01
    
    #Calculate new precision matrix 
    covmatrix = V %*% diag(L) %*% t(V)
    
    tracker = tracker + 1
  }
  
  results_list = list(covmatrix,tracker)
  #sprintf("%s iterations of desingularizer applied",tracker)
  return(results_list)
}

# test = desingularize(K_time)

GLMM with type 0 generic specification (known covariance matrix)

kgr_model2 = function(dataset, rho_EPA_rbf = 1, rho_EPA_periodic = 1, sigma2_EPA = 1,link=1){
  
  #Calculate gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf, rho_periodic = rho_EPA_periodic, sigma2 = sigma2_EPA)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA Heatmap")
  
  #Calculate trace norm of gram matrix
  K_EPA_weight = norm((1/60)*K_EPA,type = "F")
  
  ###Load graph regression kernel 
  covGP = kronecker(K_EPA/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP,threshold = 1e-2,increment = 0.01)
  covGP = covGP_jittered[[1]]
  inv_covGP = solve(covGP)
  # cov_Fnorm = norm(covGP,type = "F")
  
  #Heatmap of resulting K 
  inv_covGP_heatmap = matrix_heatmap(inv_covGP,title = "Precision matrix heatmap")
  
  ###Fit INLA model 
  # kgr_formula2 = response ~ -1 + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP)
  
  kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP)
  
  
  model = inla(formula = kgr_formula2,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values

  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model2_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_EPA_weight = K_EPA_weight/(K_EPA_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_EPA_weight + gfilter_weight),
    covmatrix = covGP,
    prec = inv_covGP,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model2_results)
}

#Fit kgr_model2
kgr_model2_fit = kgr_model2(dataset = inla_insample_data, rho_EPA_rbf = 125.474,
                            rho_EPA_periodic = 2.066, sigma2_EPA = 4.928)

#Extract DIC and WAIC 
kgr_model2_DIC = kgr_model2_fit$model_DIC
kgr_model2_WAIC = kgr_model2_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model2_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2812358 8.451518  -15.29350 1.2812358   17.85597 1.2812358
## months2    1.0498771 8.451520  -15.52486 1.0498771   17.62461 1.0498771
## months3    1.0865945 8.451517  -15.48814 1.0865945   17.66133 1.0865945
## months4    0.8859034 8.451524  -15.68884 0.8859034   17.46065 0.8859034
## months5    0.8306464 8.451522  -15.74409 0.8306464   17.40539 0.8306464
## months6    0.6889292 8.451526  -15.88582 0.6889292   17.26368 0.6889292
## months7    0.6581345 8.451531  -15.91662 0.6581345   17.23289 0.6581345
## months8    0.6302336 8.451539  -15.94454 0.6302336   17.20501 0.6302336
## months9    0.5874405 8.451533  -15.98732 0.5874405   17.16220 0.5874405
## months10   0.6607466 8.451536  -15.91402 0.6607466   17.23551 0.6607466
## months11   0.7188343 8.451532  -15.85593 0.7188343   17.29359 0.7188343
## months12   0.9632655 8.451531  -15.61149 0.9632655   17.53802 0.9632655
## Intercept1 6.4010548 8.451502  -10.17365 6.4010548   22.97575 6.4010548
## Intercept2 3.6407866 8.451511  -12.93393 3.6407866   20.21551 3.6407866
##                     kld
## months1    5.527832e-11
## months2    5.527849e-11
## months3    5.527852e-11
## months4    5.527844e-11
## months5    5.527827e-11
## months6    5.527841e-11
## months7    5.527834e-11
## months8    5.527844e-11
## months9    5.527811e-11
## months10   5.527828e-11
## months11   5.527813e-11
## months12   5.527815e-11
## Intercept1 5.527838e-11
## Intercept2 5.527839e-11
kgr_model2_fit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.4540322 0.04949595 0.3658247 0.4508021 0.5600683 0.4448735
kgr_model2_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.601087   2.857300   2.964162   2.425174   2.294802   1.991582   1.931186 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.878049   1.799377   1.936237   2.052040   2.620239 602.480221  38.121811
kgr_model2_fit$K_EPA_weight
## [1] 0.9771208
kgr_model2_fit$gfilter_weight
## [1] 0.02287917
kgr_model2_fit$num_jitters
## [1] 2
#Show plots
kgr_model2_fit$prec_heatmap

kgr_model2_fit$K_EPA_heatmap

kgr_model2_fit$param_plot

kgr_model2_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model2_fit$fitted_values)

We can also simplify the covariance of our underlying GP and see how our proposed model compares with a simplified version with a simple time kernel:

\(\Lambda_{c,t} | \textbf{F} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \textbf{F}_{t})\)

where the graph signal \(\textbf{F} | \rho_{rbf}, \rho_{p}, \sigma^2_{time} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{time})\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2}\).

Instead of calculating gram matrix K based on covariate (EPA variables) similarity, our gram matrix K is simply a time kernel where similar values of t (months 1-60) have larger covariances. As a result, this model has no spatial dependence structure built in.

Calculating simple time kernel K

time_kernel = function(time_span,rho_rbf,rho_periodic,sigma2){
  K_time = matrix(NA,nrow = time_span, ncol = time_span)
  
  for (i in 1:time_span){
    for (j in 1:time_span){
      # K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho)) * sigma2
      
      K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho_rbf)) * exp(- (2*sin(sum(abs(i-j))*3.14/12)^2)
                     / (rho_periodic)) * sigma2
    }
  }
  
  return(K_time)
}

LGCP with temporal kernel (Reference model 3)

Since there is no spatial component in this model, each cluster can be fit separately.

ref_model3 = function(dataset, cluster, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1,link=1){
  
  #Calculating gram matrix K_time
  K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                       rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  #Calculate trace norm of gram matrix
  K_time_weight = norm(K_time,type = "F")
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(K_time,threshold = 1e-2,increment = 0.01)
  K_time = covGP_jittered[[1]]
  inv_K_time = solve(K_time)
  # cov_Fnorm = norm(K_time,type = "F")

  
  #Heatmap of resulting inv_K_time 
  inv_K_time_heatmap = matrix_heatmap(inv_K_time,title = "Precision matrix heatmap")
  
  
  ###Fitting the model on each cluster 
  inla_test_clus_data = dataset %>% filter(id == cluster)
  
  ref_formula3 = response ~ -1 + Intercept1 + Intercept2 + f(time,model = "generic0",Cmatrix = inv_K_time)
  
  model = inla(ref_formula3, data = inla_test_clus_data, family = "poisson",
                    control.compute = list(dic=TRUE,waic=TRUE),
                    control.inla = list(strategy = "laplace"),
                    control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(inla_test_clus_data$id, inla_test_clus_data$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model3_results = list(
    K_time_heatmap = K_time_heatmap,
    K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
    covmatrix = K_time,
    prec = inv_K_time,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_K_time_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model3_results)
}

#Fit ref_model3 on one cluster (to test)
ref_model3_fit = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 1,
                   rho_time_periodic = 1, sigma2_time = 5)

#Extract DIC and WAIC
ref_model3_DIC = ref_model3_fit$model_DIC
ref_model3_WAIC = ref_model3_fit$model_WAIC

#Get summaries of parameter estimates
ref_model3_fit$model_summary
##                    mean          sd 0.025quant      0.5quant 0.975quant
## Intercept1 5.572305e-15 31.62254149 -62.016688 -4.615443e-15  62.016688
## Intercept2 4.496385e+00  0.04277138   4.411621  4.496549e+00   4.580233
##                     mode          kld
## Intercept1 -2.513205e-21 5.527836e-11
## Intercept2  4.496873e+00 9.973219e-09
ref_model3_fit$bri_hyperpar_summary
##                   mean         sd     q0.025       q0.5   q0.975       mode
## SD for time 0.08875563 0.01141065 0.06879227 0.08788005 0.113584 0.08622776
ref_model3_fit$exp_effects
## Intercept1 Intercept2 
##    1.00000   89.69235
ref_model3_fit$K_time_weight
## [1] 0.9984487
ref_model3_fit$gfilter_weight
## [1] 0.0015513
ref_model3_fit$num_jitters
## [1] 0
#Show plots
ref_model3_fit$prec_heatmap

ref_model3_fit$K_time_heatmap

ref_model3_fit$param_plot

ref_model3_fit$hyperparam_plot

test1 = ref_model3(dataset = inla_insample_data, cluster = 1, rho_time_rbf = 207.105,
                   rho_time_periodic = 5.302, sigma2_time = 2.879)
test2 = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 16.827,
                   rho_time_periodic = 212.333, sigma2_time = 1.053)

ref_model3_fvs = rbind(test1$fitted_values,test2$fitted_values)

pp_insample_plot(num_plots = num_clus,ref_data = inla_insample_data,pred_data = ref_model3_fvs)

KGR model with temporal kernel x graph filter (Proposed model 1)

\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)

where the graph signal \(\textbf{F} | \textbf{S}, \rho_{rbf}, \rho_{p}, \sigma^2_{time} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{time} \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).

kgr_model1 = function(dataset, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1, link=1){
  
  #Calculating gram matrix K_time
   K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                        rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  #Calculate trace norm of gram matrix
  K_time_weight = norm((1/60)*K_time,type = "F")
  
  #Calculate proposed kernel
  covGP2 = kronecker(K_time/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP2,threshold = 1e-2,increment = 0.01)
  covGP2 = covGP_jittered[[1]]
  
  inv_covGP2 = solve(covGP2)
  
  #Heatmap of resulting inv_covGP2 
  inv_covGP2_heatmap = matrix_heatmap(inv_covGP2,title = "Precision matrix heatmap")
  
  ###Fit INLA model 
  # kgr_formula1 = response ~ -1 + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
  
  kgr_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
  
  model = inla(formula = kgr_formula1,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model1_results = list(
    K_time_heatmap = K_time_heatmap,
    K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
    covmatrix = covGP2,
    prec = inv_covGP2,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP2_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model1_results)
}

#Fit kgr_model1
kgr_model1_fit = kgr_model1(dataset = inla_insample_data,rho_time_rbf = 36.653,
                            rho_time_periodic = 113.929,sigma2_time = 4.506)

#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_fit$model_DIC
kgr_model1_WAIC = kgr_model1_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model1_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2770017 8.451513  -15.29772 1.2770017   17.85172 1.2770017
## months2    1.0600572 8.451514  -15.51467 1.0600572   17.63478 1.0600572
## months3    1.0878848 8.451514  -15.48684 1.0878848   17.66261 1.0878848
## months4    0.8989463 8.451516  -15.67578 0.8989463   17.47367 0.8989463
## months5    0.8167251 8.451517  -15.75800 0.8167251   17.39145 0.8167251
## months6    0.6861147 8.451518  -15.88862 0.6861147   17.26085 0.6861147
## months7    0.6515848 8.451528  -15.92317 0.6515848   17.22634 0.6515848
## months8    0.6301173 8.451528  -15.94463 0.6301173   17.20487 0.6301173
## months9    0.5853395 8.451529  -15.98941 0.5853395   17.16009 0.5853395
## months10   0.6678850 8.451527  -15.90687 0.6678850   17.24264 0.6678850
## months11   0.7143592 8.451527  -15.86039 0.7143592   17.28911 0.7143592
## months12   0.9637071 8.451524  -15.61104 0.9637071   17.53845 0.9637071
## Intercept1 6.4000692 8.451494  -10.17462 6.4000692   22.97475 6.4000692
## Intercept2 3.6396535 8.451504  -12.93505 3.6396535   20.21436 3.6396535
##                     kld
## months1    5.527838e-11
## months2    5.527836e-11
## months3    5.527817e-11
## months4    5.527854e-11
## months5    5.527833e-11
## months6    5.527831e-11
## months7    5.527819e-11
## months8    5.527839e-11
## months9    5.527838e-11
## months10   5.527839e-11
## months11   5.527820e-11
## months12   5.527844e-11
## Intercept1 5.527858e-11
## Intercept2 5.527830e-11
kgr_model1_fit$bri_hyperpar_summary
##                 mean         sd   q0.025      q0.5    q0.975      mode
## SD for id2 0.5889307 0.06428241 0.474332 0.5847477 0.7266215 0.5770608
kgr_model1_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.585872   2.886536   2.967990   2.457013   2.263076   1.985984   1.918579 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.877831   1.795601   1.950108   2.042877   2.621396 601.886675  38.078640
kgr_model1_fit$K_time_weight
## [1] 0.9604883
kgr_model1_fit$gfilter_weight
## [1] 0.03951171
kgr_model1_fit$num_jitters
## [1] 1
#Show plots
kgr_model1_fit$K_time_heatmap

kgr_model1_fit$prec_heatmap

kgr_model1_fit$param_plot

kgr_model1_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model1_fit$fitted_values)

Finally, we could also increase the complexity of our proposed model by including our time kernel in the covariance structure of the underlying GP. Notice that K does not explicitly have a temporal dependence structure; instead, it represents EPA covariate similarity compared across months. We can explicitly include the time kernel above by either element wise adding or multiplying K_EPA and K_time together before taking the kronecker product with \(H^2\)

Proposed model 3

\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)

where the graph signal \(\textbf{F} | \textbf{S}, \rho^{time}_{rbf}, \rho^{time}_{p}, \rho^{EPA}_{rbf}, \rho^{EPA}_{p}, \sigma^2 \sim \mathcal{GP}(\textbf{0},(\textbf{K}_{time} \odot \textbf{K}_{EPA}) \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2} \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).

kgr_model3 = function(dataset,rho_EPA_rbf = 1,rho_EPA_periodic = 1,rho_time_rbf = 1,rho_time_periodic = 1,sigma2 = 1,link=1){
  
  ###Calculating gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
  
  ###Calculating gram matrix K_time
 K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                      rho_periodic = rho_time_periodic, sigma2 = sigma2)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  #Calculate trace norm of gram matrix
  gram = (K_EPA*K_time)/sigma2
  K_weight = norm((1/60)*gram,type = "F")
  
  
  ###Load graph regression kernel 
  covGP3 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP3,threshold = 1e-2,increment = 0.01)
  covGP3 = covGP_jittered[[1]]
  
  inv_covGP3 = solve(covGP3)
  
  #Heatmap of resulting K 
  inv_covGP3_heatmap = matrix_heatmap(inv_covGP3,title = "Precision matrix heatmap")
  
  
  ###Fit INLA model 
  # kgr_formula3 = response ~ -1 + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
  
  kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
  
  model = inla(formula = kgr_formula3,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "gaussian"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model3_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_time_heatmap = K_time_heatmap,
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP3,
    prec = inv_covGP3,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP3_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model3_results)
}

#Fit kgr_model3
kgr_model3_fit = kgr_model3(dataset = inla_insample_data, rho_EPA_rbf = 763.758, rho_EPA_periodic = 533.184,
                            rho_time_rbf = 52.065, rho_time_periodic = 402.182, sigma2 = 1.538)

#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_fit$model_DIC
kgr_model3_WAIC = kgr_model3_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model3_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2761617 8.451515  -15.29856 1.2761617   17.85089 1.2761617
## months2    1.0600833 8.451517  -15.51465 1.0600833   17.63481 1.0600833
## months3    1.0884177 8.451516  -15.48631 1.0884177   17.66315 1.0884177
## months4    0.8994165 8.451518  -15.67532 0.8994165   17.47415 0.8994165
## months5    0.8171242 8.451519  -15.75761 0.8171242   17.39186 0.8171242
## months6    0.6852816 8.451521  -15.88946 0.6852816   17.26002 0.6852816
## months7    0.6532741 8.451531  -15.92148 0.6532741   17.22803 0.6532741
## months8    0.6317338 8.451531  -15.94302 0.6317338   17.20649 0.6317338
## months9    0.5851621 8.451532  -15.98960 0.5851621   17.15992 0.5851621
## months10   0.6677245 8.451530  -15.90703 0.6677245   17.24248 0.6677245
## months11   0.7136081 8.451530  -15.86115 0.7136081   17.28836 0.7136081
## months12   0.9615709 8.451527  -15.61318 0.9615709   17.53632 0.9615709
## Intercept1 6.3999054 8.451487  -10.17477 6.3999054   22.97458 6.3999054
## Intercept2 3.6396531 8.451497  -12.93504 3.6396531   20.21434 3.6396531
##                     kld
## months1    5.527836e-11
## months2    5.527834e-11
## months3    5.527853e-11
## months4    5.527831e-11
## months5    5.527830e-11
## months6    5.527828e-11
## months7    5.527815e-11
## months8    5.527835e-11
## months9    5.527834e-11
## months10   5.527835e-11
## months11   5.527836e-11
## months12   5.527840e-11
## Intercept1 5.527852e-11
## Intercept2 5.527839e-11
kgr_model3_fit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.6240411 0.06742041 0.5038781 0.6196439 0.7684758 0.6115617
kgr_model3_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.582861   2.886611   2.969572   2.458168   2.263980   1.984330   1.921823 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.880869   1.795282   1.949796   2.041343   2.615802 601.788091  38.078624
kgr_model3_fit$K_weight
## [1] 0.8543652
kgr_model3_fit$gfilter_weight
## [1] 0.1456348
kgr_model3_fit$num_jitters
## [1] 1
#Show plots
kgr_model3_fit$K_time_heatmap

kgr_model3_fit$K_EPA_heatmap

kgr_model3_fit$prec_heatmap

kgr_model3_fit$param_plot

kgr_model3_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model3_fit$fitted_values)

Proposed model 4

\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)

where the graph signal \(\textbf{F} | \textbf{S}, \rho^{time}_{rbf}, \rho^{time}_{p}, \rho^{EPA}_{rbf}, \rho^{EPA}_{p}, \sigma^2_{time}, \sigma^2_{EPA} \sim \mathcal{GP}(\textbf{0},(\frac{1}{2}(\textbf{K}_{time} + \textbf{K}_{EPA}) \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \frac{1}{2} \left( \left[ \textbf{K}^{time} \right]_{t_1,t_2} + \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \right) \left[ \textbf{H}^2 \right]_{c_1,c_2}\).

kgr_model4 = function(dataset,rho_EPA_rbf = 1,rho_EPA_periodic = 1,rho_time_rbf = 1,
                      rho_time_periodic = 1,sigma2_EPA = 1,sigma2_time = 1,link = 1){
  
  ###Calculating gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
  
  ###Calculating gram matrix K_time
  K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                    rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  gram = 0.5*(K_time+K_EPA)
  K_weight = norm((1/60)*gram,type = "F")

  ###Load graph regression kernel 
  covGP4 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP4,threshold = 1e-2,increment = 0.01)
  covGP4 = covGP_jittered[[1]]
  
  inv_covGP4 = solve(covGP4)
  
  #Heatmap of resulting K 
  inv_covGP4_heatmap = matrix_heatmap(inv_covGP4,title = "Precision matrix heatmap")
  
  
  ###Fit INLA model 
  # kgr_formula4 = response ~ -1 + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
  
  kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
  
  model = inla(formula = kgr_formula4,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model4_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_time_heatmap = K_time_heatmap,
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP4,
    prec = inv_covGP4,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP4_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model4_results)
}

#Fit kgr_model4
kgr_model4_fit = kgr_model4(dataset = inla_insample_data, rho_EPA_rbf = 27.258, rho_EPA_periodic = 7.175,
                            rho_time_rbf = 33.593, rho_time_periodic = 173.952, sigma2_EPA = 4.830, sigma2_time = 4.573)

#Extract DIC and WAIC 
kgr_model4_DIC = kgr_model4_fit$model_DIC
kgr_model4_WAIC = kgr_model4_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model4_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2772026 8.451514  -15.29752 1.2772026   17.85193 1.2772026
## months2    1.0593702 8.451516  -15.51536 1.0593702   17.63410 1.0593702
## months3    1.0877353 8.451515  -15.48699 1.0877353   17.66246 1.0877353
## months4    0.8983559 8.451517  -15.67638 0.8983559   17.47309 0.8983559
## months5    0.8175152 8.451518  -15.75722 0.8175152   17.39225 0.8175152
## months6    0.6854484 8.451520  -15.88929 0.6854484   17.26018 0.6854484
## months7    0.6524784 8.451529  -15.92228 0.6524784   17.22723 0.6524784
## months8    0.6306166 8.451530  -15.94414 0.6306166   17.20537 0.6306166
## months9    0.5855092 8.451530  -15.98925 0.5855092   17.16027 0.5855092
## months10   0.6671816 8.451529  -15.90757 0.6671816   17.24194 0.6671816
## months11   0.7145031 8.451528  -15.86025 0.7145031   17.28926 0.7145031
## months12   0.9634705 8.451525  -15.61128 0.9634705   17.53822 0.9634705
## Intercept1 6.3998658 8.451493  -10.17482 6.3998658   22.97455 6.3998658
## Intercept2 3.6395212 8.451503  -12.93518 3.6395212   20.21422 3.6395212
##                     kld
## months1    5.527837e-11
## months2    5.527815e-11
## months3    5.527835e-11
## months4    5.527832e-11
## months5    5.527831e-11
## months6    5.527829e-11
## months7    5.527837e-11
## months8    5.527836e-11
## months9    5.527836e-11
## months10   5.527817e-11
## months11   5.527858e-11
## months12   5.527862e-11
## Intercept1 5.527844e-11
## Intercept2 5.527850e-11
kgr_model4_fit$bri_hyperpar_summary
##                mean       sd   q0.025      q0.5    q0.975      mode
## SD for id2 0.604151 0.065465 0.487453 0.5998881 0.7443792 0.5920527
kgr_model4_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.586593   2.884554   2.967546   2.455562   2.264865   1.984662   1.920294 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.878769   1.795905   1.948737   2.043171   2.620776 601.764250  38.073604
kgr_model4_fit$K_weight
## [1] 0.940688
kgr_model4_fit$gfilter_weight
## [1] 0.05931198
kgr_model4_fit$num_jitters
## [1] 1
#Show plots
kgr_model4_fit$K_time_heatmap

kgr_model4_fit$K_EPA_heatmap

kgr_model4_fit$prec_heatmap

kgr_model4_fit$param_plot

kgr_model4_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model4_fit$fitted_values)

Proposed model 5

\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)

where the graph signal \(\textbf{F} | \textbf{S}, \rho^{AR}_{rbf}, \rho^{AR}_{p}, \rho^{DL}_{rbf}, \rho^{DL}_{p}, \rho^{Int}_{rbf}, \rho^{Int}_{p}, \sigma^2_{AR}, \sigma^2_{DL} , \sigma^2_{Int} \sim \mathcal{GP}(\textbf{0},((\frac{1}{3} \textbf{K}_{AR} + \frac{1}{3} \textbf{K}_{DL} + \frac{1}{3} \textbf{K}_{Interaction}) \otimes \textbf{H}^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).

kgr_model5 = function(dataset, rho_AR_rbf = 1, rho_AR_periodic = 1, rho_DL_rbf = 1, rho_DL_periodic = 1,
                      rho_int_rbf = 1, rho_int_periodic = 1, sigma2_AR = 1, sigma2_DL = 1, sigma2_int = 1, link=1){
  
  #Calculating gram matrix K_AR
  K_AR_cluster = list()
  K_AR_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab S_random data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    S_random_clus = cluster_data$S_random
    
    #Create a list to contain covariance matrix for each pollutant (8)
    K_AR_list = list()
    K_AR_periodic_list = list()
    
    time_span = nrow(S_random_clus)
    
    #Calculate a AR 1 covariance matrix for each pollutant and store in list
    for (i in 1:8){
      ts =  S_random_clus[,i]
    
      K_covariate = matrix(nrow=time_span,ncol=time_span)
      K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
      
      for(j in 1:time_span){
        for (k in 1:time_span){
          if (abs(j-k) <= 1){
            
            K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel 
                                 / (2*rho_AR_rbf)) * sigma2_AR
            
            K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel 
                         / (2*rho_AR_rbf)) * exp(- (2*sin((abs(ts[j] - ts[k]))*3.14/12)^2)
                         / (rho_AR_periodic)) * sigma2_AR
          }
          else{
            K_covariate_periodic[j,k] = 0
            K_covariate[j,k] = 0
            }
        }
      }
      
      K_AR_list[[i]] = K_covariate
      K_AR_periodic_list[[i]] = K_covariate_periodic
    }
    
    names(K_AR_list) = colnames(S_random_clus)
    names(K_AR_periodic_list) = colnames(S_random_clus)
    
    #Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
    K_AR = matrix(0,nrow=60,ncol=60)
    K_AR_periodic = matrix(0,nrow=60,ncol=60)
    
    for(i in 1:length(K_AR_periodic_list)){
      K_AR = K_AR + ((1/8)*K_AR_list[[i]])
      K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
    }
    
    K_AR_cluster[[c]] = K_AR
    K_AR_periodic_cluster[[c]] = K_AR_periodic
  }
  
  K_AR = matrix(0,nrow=60,ncol=60)
  K_AR_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:num_clus){
    K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
    K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
  }
  
  #Heatmap of resulting K 
  # K_AR_heatmap = corrplot(K_AR, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "AR 1 Covariance Structure")
  # K_AR_heatmap = corrplot(K_AR_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic AR 1 Covariance Structure")
  
  # K_AR_heatmap = matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")
  K_AR_heatmap = matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")

  ###Calculating gram matrix K_DL
  K_DL_cluster = list()
  K_DL_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab S_DL data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    S_DL_clus = cluster_data$S_DL
    
    #Create a list to store covariance matrix for each DL 
    K_DL_list = list()
    K_DL_periodic_list = list()
    
    dl_lags = c(3,6,12)
    tracker = 1
    
    for (i in dl_lags){
      
      K_DL = matrix(nrow=time_span,ncol=time_span)
      K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
      
      #Calculate DL covariance matrix for specified lag   
      for(j in 1:nrow(S_DL_clus)){
        for (k in 1:nrow(S_DL_clus)){
          
          if ((abs(j-k) == 0) || (abs(j-k) == i)){
            
            K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL_rbf)) * sigma2_DL
            
            K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
                                 / (2*rho_DL_rbf)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*3.14/12)^2)
                                 / (rho_DL_periodic)) * sigma2_DL
            
          } 
          else{
            K_DL_periodic[j,k] = 0
            K_DL[j,k] = 0
            }
        }
      }
      
      K_DL_list[[tracker]] = K_DL
      K_DL_periodic_list[[tracker]] = K_DL_periodic
      tracker = tracker+1
    }
    
    #Combine the 3 DL covariance matrices together
    K_DL = matrix(0,nrow=time_span,ncol=time_span)
    K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
    
    for(i in 1:length(K_DL_periodic_list)){
      K_DL = K_DL + ((1/3)*K_DL_list[[i]])
      K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
    }
    
    #Store DL(3,6,12) covariance matrix for each cluster 
    K_DL_cluster[[c]] = K_DL
    K_DL_periodic_cluster[[c]] = K_DL
  }
  
  K_DL = matrix(0,nrow=time_span,ncol=time_span)
  K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
  
  for(i in 1:num_clus){
    K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
    K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
  }
  
  #Heatmap of resulting K 
  # K_DL_heatmap = corrplot(K_DL, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "DL (3,6,12) Covariance Structure")
  # K_DL_heatmap = corrplot(K_DL_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic DL (3,6,12) Covariance Structure")
  
  # K_DL_heatmap = matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")
  K_DL_heatmap = matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")
  
  ###Calculating gram matrix K_Interaction
  K_Interaction_cluster = list()
  K_Interaction_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab interaction pair data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    W2_clus = cluster_data$W2
    
    K_interaction_list = list()
    K_interaction_periodic_list = list()
    
    column_names = colnames(W2_clus)
    time_span = nrow(W2_clus)
    
    #Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
    lag0_idx = seq(2,3601,by=61)
    lag1_idx = seq(1,3600,by=61)
    
    #Calculate a kernel for each interaction pair 
    for (a in 1:length(column_names)){
      interaction =  W2_clus[,a]
      
      #First calculate these two interaction kernels separately 
      K_int0 = matrix(nrow = 60,ncol = 60)
      K_int1 = matrix(nrow = 60,ncol = 60)
      
      K_int0_periodic = matrix(nrow = 60,ncol = 60)
      K_int1_periodic = matrix(nrow = 60,ncol = 60)
      
      for (i in 1:60){
        for (j in 1:60){
          
          #RBF kernels
          K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                              / (2*rho_int_rbf)) * sigma2_int
    
          K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                              / (2*rho_int_rbf)) * sigma2_int
          
          #Locally periodic kernels 
          K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                          / (2*rho_int_rbf)) * 
            exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*3.14/12)^2)
                          / (rho_int_periodic)) * sigma2_int
              
          K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                               / (2*rho_int_rbf)) * 
            exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*3.14/12)^2)
                          / (rho_int_periodic)) * sigma2_int
        }
      }
      
      #Combine real time and one lag interaction kernels together
      K_interaction = 0.5*K_int0 + 0.5*K_int1
      K_interaction_list[[a]] = K_interaction
      
      K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
      K_interaction_periodic_list[[a]] = K_interaction_periodic
    }
    
    #Combine kernels for each interaction pair together
    K_interaction = matrix(0,nrow=60,ncol=60)
    K_interaction_periodic = matrix(0,nrow=60,ncol=60)
    
    for(i in 1:length(K_interaction_periodic_list)){
      K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
      
      K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
    }

    #Store final interaction kernel (for all pairs) for each cluster 
    K_Interaction_cluster[[c]] = K_interaction
    K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
  }
  
  K_interaction = matrix(0,nrow=60,ncol=60)
  K_interaction_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:num_clus){
    K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
    
    K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
  }
  
  #Heatmap of resulting K 
  # K_Interaction_heatmap = corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Interaction Covariance Structure")
  # K_Interaction_heatmap = corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic Interaction Covariance Structure")
  
  # K_Interaction_heatmap = matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
  K_Interaction_heatmap = matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")
  
  gram = (1/3)*(K_AR_periodic+K_DL_periodic+K_interaction_periodic)
  K_weight = norm((1/60)*gram,type = "F")
  
  ###Load graph regression kernel 
  covGP5 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP5,threshold = 1e-2,increment = 0.01)
  covGP5 = covGP_jittered[[1]]
  
  inv_covGP5 = solve(covGP5)
  
  #Heatmap of resulting K 
  inv_covGP5_heatmap = matrix_heatmap(inv_covGP5,title = "Precision matrix heatmap")
  
  
  ###Fit INLA model 
  # kgr_formula5 = response ~ -1 + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
  
  kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
  
  model = inla(formula = kgr_formula5,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model5_results = list(
    K_AR_heatmap = K_AR_heatmap, 
    K_DL_heatmap = K_DL_heatmap,
    K_Interaction_heatmap = K_Interaction_heatmap,
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP5,
    prec = inv_covGP5,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP5_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model5_results)
}

#Fit kgr_model5
kgr_model5_fit = kgr_model5(dataset = inla_insample_data, rho_AR_rbf = 0.014, rho_AR_periodic = 0.010,
                            rho_DL_rbf = 0.006, rho_DL_periodic = 0.004, rho_int_rbf = 0.011,
                            rho_int_periodic = 0.013, sigma2_AR = 3.751, sigma2_DL = 4.269, sigma2_int = 4.979, link=1)

#Extract DIC and WAIC 
kgr_model5_DIC = kgr_model5_fit$model_DIC
kgr_model5_WAIC = kgr_model5_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model5_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2786744 8.451523  -15.29607 1.2786744   17.85342 1.2786744
## months2    1.0596551 8.451523  -15.51509 1.0596551   17.63440 1.0596551
## months3    1.0861771 8.451521  -15.48856 1.0861771   17.66092 1.0861771
## months4    0.8970728 8.451523  -15.67767 0.8970728   17.47181 0.8970728
## months5    0.8140420 8.451524  -15.76070 0.8140420   17.38878 0.8140420
## months6    0.6833944 8.451525  -15.89135 0.6833944   17.25814 0.6833944
## months7    0.6555029 8.451535  -15.91926 0.6555029   17.23027 0.6555029
## months8    0.6329792 8.451536  -15.94179 0.6329792   17.20775 0.6329792
## months9    0.5863157 8.451536  -15.98845 0.5863157   17.16108 0.5863157
## months10   0.6676500 8.451535  -15.90712 0.6676500   17.24242 0.6676500
## months11   0.7137477 8.451535  -15.86102 0.7137477   17.28851 0.7137477
## months12   0.9618791 8.451533  -15.61288 0.9618791   17.53664 0.9618791
## Intercept1 6.3986491 8.451491  -10.17603 6.3986491   22.97333 6.3986491
## Intercept2 3.6384413 8.451501  -12.93626 3.6384413   20.21314 3.6384413
##                     kld
## months1    5.527845e-11
## months2    5.527826e-11
## months3    5.527828e-11
## months4    5.527826e-11
## months5    5.527864e-11
## months6    5.527823e-11
## months7    5.527829e-11
## months8    5.527828e-11
## months9    5.527868e-11
## months10   5.527849e-11
## months11   5.527850e-11
## months12   5.527832e-11
## Intercept1 5.527841e-11
## Intercept2 5.527834e-11
kgr_model5_fit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5    q0.975      mode
## SD for id2 0.5813778 0.06324084 0.4688161 0.577201 0.7170021 0.5695467
kgr_model5_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.591875   2.885376   2.962925   2.452414   2.257012   1.980589   1.926111 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.883213   1.797354   1.949650   2.041628   2.616609 601.032573  38.032508
kgr_model5_fit$K_AR_weight
## NULL
kgr_model5_fit$K_DL_weight
## NULL
kgr_model5_fit$K_Int_weight
## NULL
kgr_model5_fit$K_weight
## [1] 0.9365558
kgr_model5_fit$gfilter_weight
## [1] 0.06344417
kgr_model5_fit$num_jitters
## [1] 1
#Show plots
kgr_model5_fit$K_AR_heatmap

kgr_model5_fit$K_DL_heatmap

kgr_model5_fit$K_Interaction_heatmap

kgr_model5_fit$prec_heatmap

kgr_model5_fit$param_plot

kgr_model5_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model5_fit$fitted_values)

Comparing DIC and WAIC between models

With INLA, we can obtain the deviance information criterion (DIC) and the widely appliciable (or Watanabe-Akaike) information criterion (WAIC) which have the following formulas:

\(DIC = \bar D + p_D\) where the first term is the posterior mean deviance i.e., a measure of fit \(\bar D = E_{\theta | y} [D(\theta)]\) and the second term is the effective number of parameters i.e. a measure of model complexity \(p_D = E_{\theta | y} [D(\theta)] - D(E_{\theta | y}[\theta]) = \bar D - D(\bar \theta)\)

where \(D(\theta) = -2 log(p(y | \theta))\)

\(WAIC = T_n + \frac{V_n}{n}\) where \(T_n = -\frac{1}{n} \sum_{i=1}^n log p^*(Y_i | w)\) and \(V_n = \sum_{i=1}^n \{ E_w[(log p(Y_i | w))^2] - E_w[log p(Y_i | w)]^2 \}\)

where \(T_n\) is the log loss function and \(w\) is are the parameters in our model.

Also note that for both criteria, the smaller the value, the better the model

infocrit_table = matrix(nrow = 8,ncol = 2)

dics = c(ref_model1_DIC,ref_model2_DIC,kgr_model1_DIC,
         kgr_model2_DIC,kgr_model3_DIC,kgr_model4_DIC,kgr_model5_DIC)

waics = c(ref_model1_WAIC,ref_model2_WAIC,kgr_model1_WAIC,
          kgr_model2_WAIC,kgr_model3_WAIC,kgr_model4_WAIC,kgr_model5_WAIC)

infocrit_table = cbind(dics,waics)
colnames(infocrit_table) = c("DIC","WAIC")
rownames(infocrit_table) = c("Poisson GLM model","BYM model",
                             "Proposed KGR model 1","Proposed KGR model 2",
                             "Proposed KGR model 3","Proposed KGR model 4",
                             "Proposed KGR model 5")

infocrit_table = data.frame(infocrit_table)
infocrit_table
##                           DIC      WAIC
## Poisson GLM model    834.7084 1881.9626
## BYM model            834.7084 1881.9626
## Proposed KGR model 1 991.6708  977.5241
## Proposed KGR model 2 996.6220  983.6657
## Proposed KGR model 3 994.4733  980.2682
## Proposed KGR model 4 992.5576  978.1019
## Proposed KGR model 5 994.4876  980.4944

Comparing in sample RMSE for different clusters between models

One way to compare performance between the models fit above is to calculate RMSEs for each model’s fit on each cluster’s time series. Since INLA makes predictions based on the posterior predictive distribution, I actually calculated two sets of RMSEs. The first one is the RMSE of the predictions made by each model on the observed training data points i.e. not months 55-60. These were the observations that the models were fit on so we would expect small discrepancies between the observed values and the posterior predictive means for those time periods. The second one is the RMSE of the predictions made by each model on the test data points i.e. months 55-60. There was a lot more variation in the RMSEs calculated for these data points obviously.

Another important thing to note here is that the RMSEs calculated for each cluster were drastically different because the population sizes between clusters varied by a lot (think thousands compared to hundred thousands). So in order to make actual comparisons, the RMSEs had to be scaled which involves dividing the calculated RMSE by the average of the actual observed data points. The resulting RMSE values for each cluster, which are presented in tables below, can now be interpreted relative to the average number of respiratory related deaths in that cluster.

#Overall fit
RMSE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_insample_data %>% filter(id == i) %>% select(response) %>% data.frame()
  
  actual.mean = mean(actual$response)

  pm_1 = ref_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_fvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  rmse1 = sqrt(mean((actual.mean - pm_1$mean)^2))
  rmse2 = sqrt(mean((actual.mean - pm_2$mean)^2))
  rmse3 = sqrt(mean((actual.mean - pm_3$mean)^2))
  rmse4 = sqrt(mean((actual.mean - pm_4$mean)^2))
  rmse5 = sqrt(mean((actual.mean - pm_5$mean)^2))
  rmse6 = sqrt(mean((actual.mean - pm_6$mean)^2))
  rmse7 = sqrt(mean((actual.mean - pm_7$mean)^2))

  RMSE_table[,i] = c(rmse1,rmse2,rmse3,rmse4,rmse5,rmse6,rmse7) 
  RMSE_table[,i] = RMSE_table[,i] / actual.mean
}

#Table 1: In sample RMSE
RMSE_table = data.frame(RMSE_table)

colnames(RMSE_table) = c("Cluster 1","Cluster 2")
rownames(RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

RMSE_table
##                      Cluster 1 Cluster 2
## BYM model            0.2293650 0.2293866
## LGCP model           0.2433644 0.2123997
## Proposed KGR model 1 0.2434755 0.2280856
## Proposed KGR model 2 0.2434017 0.2266414
## Proposed KGR model 3 0.2434647 0.2269845
## Proposed KGR model 4 0.2434840 0.2276149
## Proposed KGR model 5 0.2434812 0.2271821

OUT OF SAMPLE FITTING (FORECASTING) ANALYSIS

For out of sample model fitting, we now use inla_outsample_data which has t=60 now instead of t=54. Recall that the response values for t=55,…,60 are NA in order for INLA to make posterior predictive predictions.

Plots of true mortality values

true_mortality = inla_full_data
true_mortality$time = as.numeric(true_mortality$time)

#Combine plots with library patchwork
true1 = true_mortality %>% filter(id == 1) %>% ggplot(aes(x=time,y=response)) + geom_line()
true2 = true_mortality %>% filter(id == 2) %>% ggplot(aes(x=time,y=response)) + geom_line() 

true1 + true2

#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_outsample_plot = function(num_plots = num_clus, ref_data = inla_full_data, pred_data){
  for (i in 1:num_plots){
  df = ref_data %>% filter(id == i) %>% select(response)
  preds = pred_data %>% filter(id == i) 
  df = cbind(df,preds)
  
  # title = sprintf("Posterior Predictive Fits for Cluster %s",i)
  title = sprintf("Cluster %s",i)

    
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + geom_vline(xintercept = 54,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
  print(post_pred_plot)
  }
}

Reference model 1

#Run model 
ref_model1_outfit = ref_model1(inla_outsample_data, a_prior=1, b_prior=1)

#Extract DIC and WAIC
ref_model1_DIC = ref_model1_outfit$model_DIC
ref_model1_WAIC = ref_model1_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model1_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2945858 8.453839  -15.28473 1.2945950   17.87385 1.2946133
## months2    1.0524633 8.453841  -15.52686 1.0524725   17.63173 1.0524907
## months3    1.0718041 8.453840  -15.50752 1.0718134   17.65107 1.0718316
## months4    0.8836125 8.453842  -15.69571 0.8836217   17.46288 0.8836399
## months5    0.8029916 8.453842  -15.77633 0.8030008   17.38226 0.8030190
## months6    0.6875554 8.453843  -15.89177 0.6875646   17.26683 0.6875829
## months7    0.6587815 8.453845  -15.92055 0.6587908   17.23806 0.6588090
## months8    0.6250464 8.453846  -15.95428 0.6250556   17.20432 0.6250738
## months9    0.5914244 8.453846  -15.98791 0.5914336   17.17070 0.5914519
## months10   0.6629946 8.453845  -15.91633 0.6630038   17.24227 0.6630221
## months11   0.7068630 8.453845  -15.87246 0.7068722   17.28614 0.7068905
## months12   0.9720679 8.453843  -15.60726 0.9720771   17.55134 0.9720953
## Intercept1 6.3807998 8.647439  -10.58939 6.3839416   23.33300 6.3877642
## Intercept2 3.6294960 8.647225  -13.33599 3.6311291   20.58563 3.6331144
##                     kld
## months1    5.509915e-11
## months2    5.509913e-11
## months3    5.509914e-11
## months4    5.509912e-11
## months5    5.509911e-11
## months6    5.509910e-11
## months7    5.509927e-11
## months8    5.509927e-11
## months9    5.509926e-11
## months10   5.509907e-11
## months11   5.509928e-11
## months12   5.509931e-11
## Intercept1 1.447476e-08
## Intercept2 1.385493e-08
ref_model1_outfit$bri_hyperpar_summary
##               mean       sd    q0.025     q0.5   q0.975      mode
## SD for id 1.621106 1.375076 0.5168023 1.191234 5.612557 0.8129476
ref_model1_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.649484   2.864699   2.920644   2.419625   2.232209   1.988848   1.932436 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.868333   1.806560   1.940595   2.027621   2.643405 590.399717  37.693813
#Show plots
ref_model1_outfit$param_plot

ref_model1_outfit$hyperparam_plot

pp_outsample_plot(pred_data = ref_model1_outfit$fitted_values)

Reference model 2

#Run model 
ref_model2_outfit = ref_model2(dataset = inla_outsample_data,a_prec_prior = 1,b_prec_prior = 1e-5,
                            a_phi_prior = 1,b_phi_prior = 1)

#Extract DIC and WAIC
ref_model2_DIC = ref_model2_outfit$model_DIC
ref_model2_WAIC = ref_model2_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model2_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2973863 8.451485  -15.27728 1.2973863   17.87205 1.2973863
## months2    1.0552637 8.451486  -15.51941 1.0552637   17.62993 1.0552637
## months3    1.0746046 8.451486  -15.50006 1.0746046   17.64927 1.0746046
## months4    0.8864129 8.451487  -15.68826 0.8864129   17.46108 0.8864129
## months5    0.8057920 8.451487  -15.76888 0.8057920   17.38046 0.8057920
## months6    0.6903558 8.451488  -15.88432 0.6903558   17.26503 0.6903558
## months7    0.6615820 8.451491  -15.91310 0.6615820   17.23626 0.6615820
## months8    0.6278468 8.451491  -15.94683 0.6278468   17.20253 0.6278468
## months9    0.5942249 8.451491  -15.98045 0.5942249   17.16890 0.5942249
## months10   0.6657951 8.451491  -15.90888 0.6657951   17.24047 0.6657951
## months11   0.7096635 8.451490  -15.86501 0.7096635   17.28434 0.7096635
## months12   0.9748683 8.451488  -15.59980 0.9748683   17.54954 0.9748683
## Intercept1 6.4000847 8.510765  -10.29357 6.4002619   23.09269 6.4005551
## Intercept2 3.6437112 8.510774  -13.04891 3.6435340   20.33738 3.6432407
##                     kld
## months1    5.527835e-11
## months2    5.527834e-11
## months3    5.527834e-11
## months4    5.527832e-11
## months5    5.527832e-11
## months6    5.527811e-11
## months7    5.527848e-11
## months8    5.527847e-11
## months9    5.527847e-11
## months10   5.527848e-11
## months11   5.527848e-11
## months12   5.527831e-11
## Intercept1 3.816438e-13
## Intercept2 3.816062e-13
ref_model2_outfit$bri_hyperpar_summary
##                                     mean          sd      q0.025        q0.5
## SD for id (idd component)     0.00471893 0.002874426 0.001729254 0.003887535
## SD for id (spatial component) 1.49053635 0.907472727 0.546600800 1.228066604
##                                   q0.975       mode
## SD for id (idd component)     0.01248447 0.00281595
## SD for id (spatial component) 3.94213501 0.89026127
ref_model2_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.659719   2.872733   2.928835   2.426410   2.238469   1.994425   1.937856 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.873572   1.811626   1.946037   2.033307   2.650818 601.895997  38.233464
#Show plots
ref_model2_outfit$param_plot

ref_model2_outfit$hyperparam_plot

pp_outsample_plot(pred_data = ref_model2_outfit$fitted_values)

Fitting kernel graph regression models

Proposed model 2

#Run model 
kgr_model2_outfit = kgr_model2(data = inla_outsample_data, rho_EPA_rbf = 125.474,
                            rho_EPA_periodic = 2.066, sigma2_EPA = 4.928)

#Extract DIC and WAIC
kgr_model2_DIC = kgr_model2_outfit$model_DIC
kgr_model2_WAIC = kgr_model2_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model2_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2812425 8.451518  -15.29349 1.2812425   17.85597 1.2812425
## months2    1.0498707 8.451520  -15.52486 1.0498707   17.62461 1.0498707
## months3    1.0866064 8.451517  -15.48812 1.0866064   17.66134 1.0866064
## months4    0.8859014 8.451524  -15.68884 0.8859014   17.46064 0.8859014
## months5    0.8306487 8.451522  -15.74409 0.8306487   17.40539 0.8306487
## months6    0.6889261 8.451526  -15.88582 0.6889261   17.26367 0.6889261
## months7    0.6581392 8.451531  -15.91662 0.6581392   17.23290 0.6581392
## months8    0.6302221 8.451539  -15.94455 0.6302221   17.20499 0.6302221
## months9    0.5874582 8.451533  -15.98730 0.5874582   17.16222 0.5874582
## months10   0.6607398 8.451536  -15.91403 0.6607398   17.23551 0.6607398
## months11   0.7188363 8.451532  -15.85592 0.7188363   17.29360 0.7188363
## months12   0.9632670 8.451531  -15.61149 0.9632670   17.53802 0.9632670
## Intercept1 6.4010602 8.451502  -10.17364 6.4010602   22.97576 6.4010602
## Intercept2 3.6407984 8.451511  -12.93392 3.6407984   20.21552 3.6407984
##                     kld
## months1    5.527851e-11
## months2    5.527829e-11
## months3    5.527832e-11
## months4    5.527824e-11
## months5    5.527847e-11
## months6    5.527841e-11
## months7    5.527834e-11
## months8    5.527864e-11
## months9    5.527831e-11
## months10   5.527828e-11
## months11   5.527833e-11
## months12   5.527835e-11
## Intercept1 5.527833e-11
## Intercept2 5.527841e-11
kgr_model2_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.4540322 0.04949596 0.3658247 0.4508022 0.5600683 0.4448736
kgr_model2_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.601111   2.857282   2.964198   2.425170   2.294807   1.991576   1.931196 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.878028   1.799409   1.936224   2.052044   2.620243 602.483423  38.122262
kgr_model2_outfit$K_EPA_weight
## [1] 0.9792925
kgr_model2_outfit$gfilter_weight
## [1] 0.02070745
#Show plots
kgr_model2_outfit$K_EPA_heatmap

kgr_model2_outfit$param_plot

kgr_model2_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model2_outfit$fitted_values)

Reference model 3

#Fit ref_model3 on one cluster (to test)
ref_model3_outfit = ref_model3(dataset = inla_outsample_data, cluster = 2, rho_time_rbf = 1,
                   rho_time_periodic = 1, sigma2_time = 5)

#Extract DIC and WAIC
ref_model3_DIC = ref_model3_outfit$model_DIC
ref_model3_WAIC = ref_model3_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model3_outfit$model_summary
##                    mean          sd 0.025quant      0.5quant 0.975quant
## Intercept1 6.204599e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept2 4.496428e+00  0.04277114   4.411667  4.496590e+00   4.580278
##                    mode          kld
## Intercept1 3.408459e-23 5.527836e-11
## Intercept2 4.496913e+00 9.965076e-09
ref_model3_outfit$bri_hyperpar_summary
##                   mean         sd     q0.025       q0.5   q0.975       mode
## SD for time 0.08875563 0.01141065 0.06879227 0.08788005 0.113584 0.08622776
ref_model3_outfit$exp_effects
## Intercept1 Intercept2 
##    1.00000   89.69616
ref_model3_outfit$K_time_weight
## [1] 0.9985287
ref_model3_outfit$gfilter_weight
## [1] 0.001471297
#Show plots
ref_model3_outfit$K_time_heatmap

ref_model3_outfit$param_plot

ref_model3_outfit$hyperparam_plot

test1 = ref_model3(dataset = inla_outsample_data, cluster = 1, rho_time_rbf = 207.105,
                   rho_time_periodic = 5.302, sigma2_time = 2.879)
test2 = ref_model3(dataset = inla_outsample_data, cluster = 2, rho_time_rbf = 16.827,
                   rho_time_periodic = 212.333, sigma2_time = 1.053)

ref_model3_outfvs = rbind(test1$fitted_values,test2$fitted_values)

pp_outsample_plot(num_plots = num_clus,ref_data = inla_full_data,pred_data = ref_model3_outfvs)

Proposed model 1

#Fit kgr_model1
kgr_model1_outfit = kgr_model1(dataset = inla_outsample_data,rho_time_rbf = 36.653,
                            rho_time_periodic = 113.929,sigma2_time = 4.506)

#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_outfit$model_DIC
kgr_model1_WAIC = kgr_model1_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model1_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2770073 8.451513  -15.29771 1.2770073   17.85173 1.2770073
## months2    1.0600472 8.451514  -15.51468 1.0600472   17.63477 1.0600472
## months3    1.0878969 8.451514  -15.48683 1.0878969   17.66262 1.0878969
## months4    0.8989396 8.451516  -15.67579 0.8989396   17.47367 0.8989396
## months5    0.8167269 8.451517  -15.75800 0.8167269   17.39146 0.8167269
## months6    0.6861153 8.451518  -15.88862 0.6861153   17.26085 0.6861153
## months7    0.6515925 8.451528  -15.92316 0.6515925   17.22634 0.6515925
## months8    0.6301060 8.451528  -15.94465 0.6301060   17.20486 0.6301060
## months9    0.5853602 8.451529  -15.98939 0.5853602   17.16011 0.5853602
## months10   0.6678766 8.451527  -15.90687 0.6678766   17.24263 0.6678766
## months11   0.7143621 8.451527  -15.86039 0.7143621   17.28911 0.7143621
## months12   0.9637069 8.451524  -15.61104 0.9637069   17.53845 0.9637069
## Intercept1 6.4000724 8.451494  -10.17461 6.4000724   22.97476 6.4000724
## Intercept2 3.6396651 8.451504  -12.93504 3.6396651   20.21437 3.6396651
##                     kld
## months1    5.527858e-11
## months2    5.527836e-11
## months3    5.527837e-11
## months4    5.527834e-11
## months5    5.527853e-11
## months6    5.527811e-11
## months7    5.527839e-11
## months8    5.527859e-11
## months9    5.527838e-11
## months10   5.527839e-11
## months11   5.527840e-11
## months12   5.527824e-11
## Intercept1 5.527843e-11
## Intercept2 5.527832e-11
kgr_model1_outfit$bri_hyperpar_summary
##                 mean         sd   q0.025      q0.5    q0.975     mode
## SD for id2 0.5889307 0.06428241 0.474332 0.5847477 0.7266215 0.577061
kgr_model1_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.585892   2.886507   2.968025   2.456996   2.263080   1.985986   1.918594 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.877810   1.795638   1.950092   2.042883   2.621396 601.888643  38.079084
kgr_model1_outfit$K_time_weight
## [1] 0.9625598
kgr_model1_outfit$gfilter_weight
## [1] 0.03744022
#Show plots
kgr_model1_outfit$K_time_heatmap

kgr_model1_outfit$param_plot

kgr_model1_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model1_outfit$fitted_values)

Proposed model 3

#Fit kgr_model3
kgr_model3_outfit = kgr_model3(dataset = inla_outsample_data, rho_EPA_rbf = 763.758, rho_EPA_periodic = 533.184,
                            rho_time_rbf = 52.065, rho_time_periodic = 402.182, sigma2 = 1.538, link=1)

#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_outfit$model_DIC
kgr_model3_WAIC = kgr_model3_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model3_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2761647 8.451515  -15.29856 1.2761647   17.85089 1.2761647
## months2    1.0600778 8.451517  -15.51465 1.0600778   17.63481 1.0600778
## months3    1.0884228 8.451516  -15.48631 1.0884228   17.66315 1.0884228
## months4    0.8994129 8.451518  -15.67532 0.8994129   17.47415 0.8994129
## months5    0.8171253 8.451519  -15.75761 0.8171253   17.39186 0.8171253
## months6    0.6852797 8.451521  -15.88946 0.6852797   17.26002 0.6852797
## months7    0.6532784 8.451531  -15.92148 0.6532784   17.22804 0.6532784
## months8    0.6317258 8.451531  -15.94303 0.6317258   17.20648 0.6317258
## months9    0.5851745 8.451532  -15.98958 0.5851745   17.15993 0.5851745
## months10   0.6677186 8.451530  -15.90704 0.6677186   17.24247 0.6677186
## months11   0.7136095 8.451530  -15.86115 0.7136095   17.28836 0.7136095
## months12   0.9615687 8.451527  -15.61318 0.9615687   17.53632 0.9615687
## Intercept1 6.3999031 8.451487  -10.17477 6.3999031   22.97457 6.3999031
## Intercept2 3.6396556 8.451497  -12.93504 3.6396556   20.21435 3.6396556
##                     kld
## months1    5.527836e-11
## months2    5.527853e-11
## months3    5.527834e-11
## months4    5.527831e-11
## months5    5.527850e-11
## months6    5.527848e-11
## months7    5.527835e-11
## months8    5.527835e-11
## months9    5.527834e-11
## months10   5.527835e-11
## months11   5.527816e-11
## months12   5.527860e-11
## Intercept1 5.527832e-11
## Intercept2 5.527840e-11
kgr_model3_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.6240411 0.06742041 0.5038781 0.6196438 0.7684758 0.6115616
kgr_model3_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.582872   2.886595   2.969587   2.458160   2.263982   1.984327   1.921831 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.880854   1.795304   1.949784   2.041346   2.615797 601.786704  38.078720
kgr_model3_outfit$K_weight
## [1] 0.8612827
kgr_model3_outfit$gfilter_weight
## [1] 0.1387173
#Show plots
kgr_model3_outfit$K_time_heatmap

kgr_model3_outfit$K_EPA_heatmap

kgr_model3_outfit$param_plot

kgr_model3_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model3_outfit$fitted_values)

Proposed model 4

#Fit kgr_model4
kgr_model4_outfit = kgr_model4(dataset = inla_outsample_data, rho_EPA_rbf = 27.258, rho_EPA_periodic = 7.175,
                            rho_time_rbf = 33.593, rho_time_periodic = 173.952, sigma2_EPA = 4.830, sigma2_time = 4.573)

#Extract DIC and WAIC 
kgr_model4_DIC = kgr_model4_outfit$model_DIC
kgr_model4_WAIC = kgr_model4_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model4_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2772086 8.451514  -15.29752 1.2772086   17.85193 1.2772086
## months2    1.0593596 8.451516  -15.51537 1.0593596   17.63409 1.0593596
## months3    1.0877484 8.451515  -15.48698 1.0877484   17.66248 1.0877484
## months4    0.8983490 8.451517  -15.67638 0.8983490   17.47308 0.8983490
## months5    0.8175168 8.451518  -15.75722 0.8175168   17.39225 0.8175168
## months6    0.6854489 8.451520  -15.88929 0.6854489   17.26018 0.6854489
## months7    0.6524863 8.451529  -15.92227 0.6524863   17.22724 0.6524863
## months8    0.6306048 8.451530  -15.94415 0.6306048   17.20536 0.6306048
## months9    0.5855315 8.451530  -15.98922 0.5855315   17.16029 0.5855315
## months10   0.6671735 8.451529  -15.90758 0.6671735   17.24193 0.6671735
## months11   0.7145058 8.451528  -15.86025 0.7145058   17.28926 0.7145058
## months12   0.9634714 8.451525  -15.61128 0.9634714   17.53822 0.9634714
## Intercept1 6.3998703 8.451493  -10.17481 6.3998703   22.97455 6.3998703
## Intercept2 3.6395343 8.451503  -12.93517 3.6395343   20.21424 3.6395343
##                     kld
## months1    5.527817e-11
## months2    5.527855e-11
## months3    5.527815e-11
## months4    5.527852e-11
## months5    5.527832e-11
## months6    5.527829e-11
## months7    5.527857e-11
## months8    5.527836e-11
## months9    5.527836e-11
## months10   5.527837e-11
## months11   5.527838e-11
## months12   5.527842e-11
## Intercept1 5.527849e-11
## Intercept2 5.527852e-11
kgr_model4_outfit$bri_hyperpar_summary
##                mean       sd   q0.025      q0.5    q0.975      mode
## SD for id2 0.604151 0.065465 0.487453 0.5998881 0.7443792 0.5920526
kgr_model4_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.586614   2.884523   2.967585   2.455546   2.264869   1.984663   1.920309 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.878746   1.795945   1.948721   2.043177   2.620778 601.766966  38.074103
kgr_model4_outfit$K_weight
## [1] 0.9440546
kgr_model4_outfit$gfilter_weight
## [1] 0.05594535
#Show plots
kgr_model4_outfit$K_time_heatmap

kgr_model4_outfit$K_EPA_heatmap

kgr_model4_outfit$param_plot

kgr_model4_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model4_outfit$fitted_values)

Proposed model 5

#Fit kgr_model5
kgr_model5_outfit = kgr_model5(dataset = inla_outsample_data, rho_AR_rbf = 0.014, rho_AR_periodic = 0.010,
                            rho_DL_rbf = 0.006, rho_DL_periodic = 0.004, rho_int_rbf = 0.011,
                            rho_int_periodic = 0.013, sigma2_AR = 3.751, sigma2_DL = 4.269, sigma2_int = 4.979, link=1)

#Extract DIC and WAIC 
kgr_model5_DIC = kgr_model5_outfit$model_DIC
kgr_model5_WAIC = kgr_model5_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model5_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    1.2786744 8.451523  -15.29607 1.2786744   17.85342 1.2786744
## months2    1.0596551 8.451523  -15.51509 1.0596551   17.63440 1.0596551
## months3    1.0861771 8.451521  -15.48856 1.0861771   17.66092 1.0861771
## months4    0.8970728 8.451523  -15.67767 0.8970728   17.47181 0.8970728
## months5    0.8140420 8.451524  -15.76070 0.8140420   17.38878 0.8140420
## months6    0.6833944 8.451525  -15.89135 0.6833944   17.25814 0.6833944
## months7    0.6555029 8.451535  -15.91926 0.6555029   17.23027 0.6555029
## months8    0.6329792 8.451536  -15.94179 0.6329792   17.20775 0.6329792
## months9    0.5863157 8.451536  -15.98845 0.5863157   17.16108 0.5863157
## months10   0.6676500 8.451535  -15.90712 0.6676500   17.24242 0.6676500
## months11   0.7137477 8.451535  -15.86102 0.7137477   17.28851 0.7137477
## months12   0.9618791 8.451533  -15.61288 0.9618791   17.53664 0.9618791
## Intercept1 6.3986491 8.451491  -10.17603 6.3986491   22.97333 6.3986491
## Intercept2 3.6384413 8.451501  -12.93626 3.6384413   20.21314 3.6384413
##                     kld
## months1    5.527825e-11
## months2    5.527846e-11
## months3    5.527828e-11
## months4    5.527826e-11
## months5    5.527844e-11
## months6    5.527842e-11
## months7    5.527849e-11
## months8    5.527828e-11
## months9    5.527848e-11
## months10   5.527829e-11
## months11   5.527830e-11
## months12   5.527812e-11
## Intercept1 5.527822e-11
## Intercept2 5.527835e-11
kgr_model5_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5    q0.975      mode
## SD for id2 0.5813778 0.06324085 0.4688161 0.577201 0.7170021 0.5695469
kgr_model5_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   3.591875   2.885376   2.962925   2.452414   2.257012   1.980589   1.926111 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   1.883213   1.797354   1.949650   2.041628   2.616609 601.032573  38.032508
kgr_model5_outfit$K_weight
## [1] 0.9365558
kgr_model5_outfit$gfilter_weight
## [1] 0.06344417
#Show plots
# heatmap(t(kgr_model5_outfit$prec))
# kgr_model5_outfit$K_AR_heatmap
# kgr_model5_outfit$K_DL_heatmap
# kgr_model5_outfit$K_Interaction_heatmap
# kgr_model5_outfit$prec_heatmap
kgr_model5_outfit$param_plot

kgr_model5_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model5_outfit$fitted_values)

Comparing out of sample MAE, MASE, and MAPE for different clusters between models

print(degree_connectivity)
##   c(1:num_clus) node_connections
## 1             1                1
## 2             2                1

To compare out of sample fit, we calculate a variety of metrics. The first three, MAE, MASE, and MAPE specifically evaluate forecast accuracy i.e. the prediction accuracy for the time points t=55,…,60. The last one is RMSE which is calculated based on the entire sample to get an idea of wholistic fit. Note that once again, we scale the RMSEs so that they can be interpreted relative to the average number of respiratory related deaths in that cluster. Lastly, we also calculate a Frequentist coverage rate based on the credible intervals produced by INLA to get an idea of our models’ uncertainty quantification.

Mean absolute error (MAE) is a measure of the average size of the mistakes in a collection of predictions, without taking their direction into account

MAE = \(\frac{1}{h_{max}} \sum_{h=1}^{h_{max}} | \hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|\) where \(\hat \lambda_{t+h}^{obs}\) is the average of the number of deaths observed for month \(t+h\) over all years

MAE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }

  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  mae1 = mean(abs(actual_test - pm_1_test$mean))
  mae2 = mean(abs(actual_test - pm_2_test$mean))
  mae3 = mean(abs(actual_test - pm_3_test$mean))
  mae4 = mean(abs(actual_test - pm_4_test$mean))
  mae5 = mean(abs(actual_test - pm_5_test$mean))
  mae6 = mean(abs(actual_test - pm_6_test$mean))
  mae7 = mean(abs(actual_test - pm_7_test$mean))

  MAE_table[,i] = c(mae1,mae2,mae3,mae4,mae5,mae6,mae7)
  # MAE_table[,i] = MAE_table[,i] / actual_test_mean
}

#Table 2: MAE on test dataset

MAE_table = data.frame(MAE_table)

colnames(MAE_table) = c("Cluster 1","Cluster 2")
rownames(MAE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MAE_table
##                       Cluster 1 Cluster 2
## BYM model              9.921266  3.295392
## LGCP model           124.199948  4.267689
## Proposed KGR model 1  17.285155  2.604157
## Proposed KGR model 2  16.049136  2.853820
## Proposed KGR model 3  13.728338  2.625709
## Proposed KGR model 4  13.719393  2.614742
## Proposed KGR model 5   7.870392  3.069713

Mean absolute scaled error (MASE) is a measure of the accuracy of forecasts. It is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast.

MASE = \(\frac{\frac{1}{n} \sum_{t=1}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|}{\frac{1}{n-1} \sum_{t=2}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h-1}^{obs}|}\)

MASE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  actual_test = c()
  
  for (j in 6:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }

  pm_1_test = pm_1$mean[54:60]
  pm_2_test = pm_2$mean[54:60]
  pm_3_test = pm_3$mean[54:60]
  pm_4_test = pm_4$mean[54:60]
  pm_5_test = pm_5$mean[54:60]
  pm_6_test = pm_6$mean[54:60]
  pm_7_test = pm_7$mean[54:60] 
  actual_test_mean = mean(actual_test)
  
  values1 = c()
  values2 = c()
  values3 = c()
  values4 = c()
  values5 = c()
  values6 = c()
  values7 = c()
  
  for (j in 2:length(actual_test)){
    error1 = (abs(actual_test[j] - pm_1_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values1 = c(values1,error1)
    
    error2 = (abs(actual_test[j] - pm_2_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values2 = c(values2,error2)
    
    error3 = (abs(actual_test[j] - pm_3_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values3 = c(values3,error3)
    
    error4 = (abs(actual_test[j] - pm_4_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values4 = c(values4,error4)
    
    error5 = (abs(actual_test[j] - pm_5_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values5 = c(values5,error5)
    
    error6 = (abs(actual_test[j] - pm_6_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values6 = c(values6,error6)
    
    error7 = (abs(actual_test[j] - pm_7_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values7 = c(values7,error7)
  }
  
  mase1 = mean(values1)
  mase2 = mean(values2)
  mase3 = mean(values3)
  mase4 = mean(values4)
  mase5 = mean(values5)
  mase6 = mean(values6)
  mase7 = mean(values7)

  MASE_table[,i] = c(mase1,mase2,mase3,mase4,mase5,mase6,mase7)
  # MASE_table[,i] = MASE_table[,i] / actual_test_mean
}

#Table 3: MASE on test dataset

MASE_table = data.frame(MASE_table)

colnames(MASE_table) = c("Cluster 1","Cluster 2")
rownames(MASE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MASE_table
##                       Cluster 1 Cluster 2
## BYM model            0.02649697 0.1398932
## LGCP model           0.37282141 0.3534943
## Proposed KGR model 1 0.04966609 0.2040884
## Proposed KGR model 2 0.05610354 0.2139280
## Proposed KGR model 3 0.03326008 0.1333344
## Proposed KGR model 4 0.03684080 0.1633988
## Proposed KGR model 5 0.02415540 0.2237026

Mean absolute percentage error (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting method in statistics, expressing accuracy as a ratio

MAPE = \(\frac{100}{n} \sum_{t=1}^n |\frac{\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}}{\hat \lambda_{t+h}^{obs}}|\)

MAPE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()

  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }
  
  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  mape1 = mean(abs((actual_test - pm_1_test$mean)/actual_test))
  mape2 = mean(abs((actual_test - pm_2_test$mean)/actual_test))
  mape3 = mean(abs((actual_test - pm_3_test$mean)/actual_test))
  mape4 = mean(abs((actual_test - pm_4_test$mean)/actual_test))
  mape5 = mean(abs((actual_test - pm_5_test$mean)/actual_test))
  mape6 = mean(abs((actual_test - pm_6_test$mean)/actual_test))
  mape7 = mean(abs((actual_test - pm_7_test$mean)/actual_test))

  MAPE_table[,i] = c(mape1,mape2,mape3,mape4,mape5,mape6,mape7)
  # MAPE_table[,i] = MAPE_table[,i] / actual_test_mean
}

#Table 2: RMSE on test dataset

MAPE_table = data.frame(MAPE_table)

colnames(MAPE_table) = c("Cluster 1","Cluster 2")
rownames(MAPE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MAPE_table
##                        Cluster 1  Cluster 2
## BYM model            0.008175899 0.04217006
## LGCP model           0.102717608 0.05563386
## Proposed KGR model 1 0.014431803 0.03280457
## Proposed KGR model 2 0.013563124 0.03725918
## Proposed KGR model 3 0.011044404 0.03337929
## Proposed KGR model 4 0.011351979 0.03305294
## Proposed KGR model 5 0.006700932 0.03966520
#Overall fit
test_RMSE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()

  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }
  
  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  test_rmse1 = sqrt(mean((actual_test - pm_1_test$mean)^2))
  test_rmse2 = sqrt(mean((actual_test - pm_2_test$mean)^2))
  test_rmse3 = sqrt(mean((actual_test - pm_3_test$mean)^2))
  test_rmse4 = sqrt(mean((actual_test - pm_4_test$mean)^2))
  test_rmse5 = sqrt(mean((actual_test - pm_5_test$mean)^2))
  test_rmse6 = sqrt(mean((actual_test - pm_6_test$mean)^2))
  test_rmse7 = sqrt(mean((actual_test - pm_7_test$mean)^2))
  
  test_RMSE_table[,i] = c(test_rmse1,test_rmse2,test_rmse3,test_rmse4,
                          test_rmse5,test_rmse6,test_rmse7)
  test_RMSE_table[,i] = test_RMSE_table[,i] / actual_test_mean
}

#Table 2: RMSE on test dataset

test_RMSE_table = data.frame(test_RMSE_table)

colnames(test_RMSE_table) = c("Cluster 1","Cluster 2")
rownames(test_RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

test_RMSE_table
##                        Cluster 1  Cluster 2
## BYM model            0.008885734 0.04998013
## LGCP model           0.104536900 0.07820596
## Proposed KGR model 1 0.016409611 0.03971802
## Proposed KGR model 2 0.016019426 0.04393729
## Proposed KGR model 3 0.013151129 0.03903035
## Proposed KGR model 4 0.013590642 0.03953637
## Proposed KGR model 5 0.007964804 0.04441662

Comparing coverage rates of credible intervals produced by each model

coverage = rep(0,7)
true_values = inla_full_data$response
models_fvs = list(ref_model2_outfit$fitted_values,ref_model3_outfvs,kgr_model1_outfit$fitted_values,
                  kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)

for (i in 1:7){
  lci = models_fvs[[i]] %>% select('0.025quant')
  uci = models_fvs[[i]] %>% select('0.975quant')
  
  captured = (true_values >= lci$'0.025quant' & true_values <= uci$'0.975quant')
  coverage[i] = sum(captured)/length(captured)
}

coverage = data.frame(coverage)
colnames(coverage) = "95% coverage"
rownames(coverage) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3","Proposed KGR model 4","Proposed KGR model 5")
coverage
##                      95% coverage
## BYM model               0.3166667
## LGCP model              0.1833333
## Proposed KGR model 1    0.9416667
## Proposed KGR model 2    0.9416667
## Proposed KGR model 3    0.9416667
## Proposed KGR model 4    0.9416667
## Proposed KGR model 5    0.9250000

Plotting heatmaps of CA with estimated mean and variance of intensity function for each model

#Plot heatmap for time = 60
#clusterlabels$counties = tolower(clusterlabels$counties)
colnames(clusterlabels) = c("subregion","cluster")
merged_response = join(ca_map,clusterlabels,by = "subregion")

true_values = inla_full_data %>% filter(time == 60) %>% select(id,response)
colnames(true_values) = c("cluster","response")
merged_response = join(merged_response,true_values,by = "cluster")

heatmap_limits = c(0,2000)
legend_titles = c("Ref model 2 fitted values","Ref model 3 fitted values","Prop model 1 fitted values",
                  "Prop model 2 fitted values","Prop model 3 fitted values","Prop model 4 fitted values")

#Plot of observed mortality
gg_pop <- ggplot() +
  geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = response), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  scale_fill_viridis_c(limits = heatmap_limits, name = "Observed mortality") +
  theme_void() +
    labs(title = "Observed Mortality (Dec 2018) for CA",
         x = "Longitude",
         y = "Latitude")

print(gg_pop)

Plotting heatmaps of mean of intensity function

models_intensity_fvs = list(ref_model1_outfit$fitted_values,ref_model2_outfit$fitted_values,ref_model3_outfvs,
                            kgr_model1_outfit$fitted_values,kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,
                            kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)

merged_response = join(ca_map,clusterlabels,by = "subregion")

legend_titles = c("Ref model 1 fitted values","Ref model 2 fitted values","Ref model 3 fitted values",
                  "Prop model 1 fitted values","Prop model 2 fitted values","Prop model 3 fitted values",
                  "Prop model 4 fitted values","Prop model 5 fitted values")

for (i in 1:8){
  fitted_values = models_intensity_fvs[[i]] %>% filter(time == 60) %>% select(id,mean)
  heatmap_limits = c(0,1.5*max(fitted_values$mean))
  colname = sprintf("prediction.%s",i)
  colnames(fitted_values) = c("cluster",colname)
  merged_response = join(merged_response,fitted_values,by = "cluster")
  
  #Heatmap of each model's fvs 
  gg_pop <- ggplot() +
    geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,i+7]), 
                 color = "black") +
    coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
    scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[i]) +
    theme_void() +
      labs(title = "Predicted Mean (Dec 2019) for CA",
         x = "Longitude",
         y = "Latitude")
  
  print(gg_pop)
}

Plotting heatmaps of variance of intensity function

merged_response = join(ca_map,clusterlabels,by = "subregion")

legend_titles = c("Ref model 1 variance","Ref model 2 variance","Ref model 3 variance",
                  "Prop model 1 variance","Prop model 2 variance","Prop model 3 variance",
                  "Prop model 4 variance","Prop model 5 variance")

for (j in 1:8){
  fitted_values = models_intensity_fvs[[j]] %>% filter(time == 60) %>% select(id,sd)
  fitted_values$sd = fitted_values$sd^2
  heatmap_limits = c(0,1.5*max(fitted_values$sd))
  colname = sprintf("prediction.%s",j)
  colnames(fitted_values) = c("cluster",colname)
  merged_response = join(merged_response,fitted_values,by = "cluster")
  
  #Heatmap of each model's fvs 
  gg_pop <- ggplot() +
    geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,j+7]), 
                 color = "black") +
    coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
    scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[j]) +
    theme_void() +
      labs(title = "Dec 2019",
         x = "Longitude",
         y = "Latitude")
  
  print(gg_pop)
}

Sliding timeframe forecasting exercise:

One step ahead forecasting

In this section, I implemented a forecasting exercise in which I start with 36 months of training data. I will use that data to estimate a model and then forecast one month ahead. Using our starting data AND the forecasted values, I will re-estimate the model and forecast again to get the next month’s predictions. This process continues until the original 36 months of data have been used to produce a complete time series of 60 months (the last 3 years are forecasted). This exercise gives us another way to compare the predictive ability of our various models.

Reference model

starting_data = inla_full_data[1:72,] #378 for last 6 months 
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

MAE = c()
MAPE = c()
RMSPE = c()

while(max(starting_data$time) < 60){
  
  ###Attach df for next month with NAs in response
  end = nrow(starting_data)
  id = c(1,2)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+2)
  response = rep(NA,2)
  time = rep(starting_data$time[end]+1,each=2)
  Intercept1 = c(1,NA)
  Intercept2 = c(NA,1)
  
  if (starting_data$months[end] == 12){
    months = rep(1,2)
  } else{
    value = starting_data$months[end]
    months = rep((value+1),2)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  ###Fit reference model

  #Can set a fixed initial value for hyperprior if you want to fix hyperparameters
  # prec_prior <- list(prec = list(prior = "loggamma", param = c(0.01,0.01), inital = 2.771018, fixed=TRUE))
  
    ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + f(id, model = "bym", graph = huge.est) 
    
  ref_model2 = inla(ref_formula2,family = "poisson",data = starting_data,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.predictor = list(compute = TRUE, link = 1))
  
  ###Append ref model 2 predictions to starting data
  preds_ref_model2 = ref_model2$summary.fitted.values
  preds_ref_model2$mean = round(preds_ref_model2$mean)
  end2 = nrow(preds_ref_model2)

  pred_data = preds_ref_model2$mean[(end2-1):end2]
  starting_data$response[(end+1):(end+2)] = pred_data
  
  # starting_data$response = replace(starting_data$response,which(starting_data$response < 0),0)
  starting_data$months = as.numeric(starting_data$months)
  
  ###Calculate performance metrics at each step (avg over all clusters)
  est_lambda = rep(1,2)
  
  for (c in 1:num_clus){
  actual = inla_full_data %>% filter(id == c)  %>% data.frame()
  m = unique(months)
  
  actual_m = actual %>% filter(months == m) %>% select(response)
  est_lambda[c] = mean(actual_m$response)
  }
  
  MAE = c(MAE,mean(abs(est_lambda - pred_data)))
  MAPE = c(MAPE,mean(abs((est_lambda - pred_data)/est_lambda)))
  RMSPE = c(RMSPE,sqrt(mean((est_lambda - pred_data)^2)))
}


###Have to calculate MASE for each time point separately
MASE = c()
m = 12

est_lambdas_df = rep(1,2)
pred_test_df = rep(1,2)

for (j in 36:60){
  for (c in 1:num_clus){
    actual = starting_data %>% filter(id == c)  %>% data.frame()
    actual_m = actual %>% filter(months == m,time < j) %>% select(response)
    est_lambda[c] = mean(actual_m$response)
  }
  
  pred_test = starting_data %>% filter(time == j) %>% select(response) %>% as.matrix() %>% c()
  
  pred_test_df = rbind(pred_test_df,pred_test)
  est_lambdas_df = rbind(est_lambdas_df,est_lambda)
  
    if (m == 12){
  m = 1
  } else{
  m = value+1
  }
}

est_lambdas_df = est_lambdas_df[-1,]
pred_test_df = pred_test_df[-1,]

for (j in 2:nrow(pred_test_df)){
  error = (abs(est_lambdas_df[j,] - pred_test_df[j,])) / (abs(est_lambdas_df[j,] - est_lambdas_df[(j-1),])) / (nrow(est_lambdas_df)-1)
  MASE = c(MASE,mean(error))
}


ref_model_error = cbind(MAE,MASE,MAPE,RMSPE)
ref_model_error
##        MAE        MASE       MAPE     RMSPE
##  [1,] 60.3 0.002484030 0.03650599 83.458852
##  [2,] 15.8 0.014685701 0.01080008 22.063545
##  [3,]  8.7 0.028415913 0.03482330  8.769265
##  [4,] 12.5 0.009661302 0.02760487 15.116216
##  [5,] 12.1 0.060781330 0.02353614 15.261062
##  [6,]  4.6 0.032435032 0.03091740  4.617359
##  [7,]  4.9 0.077876489 0.02404973  5.255473
##  [8,]  7.0 0.039141325 0.01151189  9.350936
##  [9,] 18.7 0.082481231 0.06230002 22.493110
## [10,]  2.4 0.034997667 0.01298351  2.473863
## [11,]  2.5 0.072751219 0.01710814  2.501999
## [12,] 32.8 0.004642714 0.06850193 40.175614
## [13,] 60.3 0.001953506 0.03650599 83.458852
## [14,] 15.8 0.013850983 0.01080008 22.063545
## [15,]  8.7 0.029151950 0.03482330  8.769265
## [16,] 12.5 0.010722947 0.02760487 15.116216
## [17,] 12.1 0.061976990 0.02353614 15.261062
## [18,]  4.6 0.033822717 0.03091740  4.617359
## [19,]  4.9 0.079322473 0.02404973  5.255473
## [20,]  7.0 0.040624878 0.01151189  9.350936
## [21,] 18.7 0.083982921 0.06230002 22.493110
## [22,]  2.4 0.036424074 0.01298351  2.473863
## [23,]  2.5 0.074119759 0.01710814  2.501999
## [24,] 32.8 0.003593823 0.06850193 40.175614
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  title = sprintf("Cluster %s",i)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
  print(post_pred_plot)
}

Proposed model

inv_covGP = kgr_model2_outfit$prec
inv_covGP3 = kgr_model3_outfit$prec
inv_covGP4 = kgr_model4_outfit$prec
inv_covGP5 = kgr_model5_outfit$prec

starting_data = inla_full_data[1:72,] #378 for last 6 months 
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

MAE2 = c()
MAPE2 = c()
RMSPE2 = c()

while(max(starting_data$time) < 60){
  
  ###Attach df for next month with NAs in response
  end = nrow(starting_data)
  id = c(1,2)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+2)
  response = rep(NA,2)
  time = rep(starting_data$time[end]+1,each=2)
  Intercept1 = c(1,NA)
  Intercept2 = c(NA,1)
  
  if (starting_data$months[end] == 12){
    months = rep(1,2)
  } else{
    value = starting_data$months[end]
    months = rep((value+1),2)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  ###Fit KGR model

  #Can set a fixed initial value for hyperprior if you want to fix hyperparameters
  prec_prior <- list(prec = list(prior = "loggamma", param = c(0.01,0.01), inital = 2.771018, fixed=TRUE))
  
  # #Proposed model 2
  # kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + f(id,model = "generic0",Cmatrix = inv_covGP[c(starting_data$id2),c(starting_data$id2)])
  # 
  # kgr_model2 = inla(kgr_formula2, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model2$summary.fitted.values
  
  # #Proposed model 3
  # kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + f(id,model = "generic0",Cmatrix = inv_covGP3[c(starting_data$id2),c(starting_data$id2)])
  # 
  # kgr_model3 = inla(kgr_formula3, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model3$summary.fitted.values
  
  # #Proposed model 4
  kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + f(id,model = "generic0",Cmatrix = inv_covGP4[c(starting_data$id2),c(starting_data$id2)])

  kgr_model4 = inla(kgr_formula4, data = starting_data, family = "poisson",
                    control.predictor = list(compute = TRUE, link = 1))

  preds_kgr_model = kgr_model4$summary.fitted.values
  
  #Proposed model 5
  # kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + f(id,model = "generic0",Cmatrix = inv_covGP5[c(starting_data$id2),c(starting_data$id2)], hyper = prec_prior)
  # 
  # kgr_model5 = inla(kgr_formula5, data = starting_data, family = "poisson",
  #                   control.inla = list(strategy = "laplace"),
  #                   control.predictor = list(compute = TRUE, link = 1)) 
  # 
  # preds_kgr_model = kgr_model5$summary.fitted.values
  
  ###Append KGR model predictions to starting data
  preds_kgr_model$mean = round(preds_kgr_model$mean)
  end2 = nrow(preds_kgr_model)

  pred_data = preds_kgr_model$mean[(end2-1):end2]
  starting_data$response[(end+1):(end+2)] = pred_data
  
  # starting_data$response = replace(starting_data$response,which(starting_data$response < 0),0)
  starting_data$months = as.numeric(starting_data$months)
  
  ###Calculate performance metrics at each step (avg over all clusters)
  est_lambda = rep(1,2)
  
  for (c in 1:num_clus){
  actual = inla_full_data %>% filter(id == c)  %>% data.frame()
  m = unique(months)
  
  actual_m = actual %>% filter(months == m) %>% select(response)
  est_lambda[c] = mean(actual_m$response)
  }
  
  MAE2 = c(MAE2,mean(abs(est_lambda - pred_data)))
  MAPE2 = c(MAPE2,mean(abs((est_lambda - pred_data)/est_lambda)))
  RMSPE2 = c(RMSPE2,sqrt(mean((est_lambda - pred_data)^2)))
}


###Have to calculate MASE for each time point separately
MASE2 = c()
m = 12

est_lambdas_df = rep(1,2)
pred_test_df = rep(1,2)

for (j in 36:60){
  for (c in 1:num_clus){
    actual = starting_data %>% filter(id == c)  %>% data.frame()
    actual_m = actual %>% filter(months == m,time < j) %>% select(response)
    est_lambda[c] = mean(actual_m$response)
  }
  
  pred_test = starting_data %>% filter(time == j) %>% select(response) %>% as.matrix() %>% c()
  
  pred_test_df = rbind(pred_test_df,pred_test)
  est_lambdas_df = rbind(est_lambdas_df,est_lambda)
  
    if (m == 12){
  m = 1
  } else{
  m = value+1
  }
}

est_lambdas_df = est_lambdas_df[-1,]
pred_test_df = pred_test_df[-1,]

for (j in 2:nrow(pred_test_df)){
  error = (abs(est_lambdas_df[j,] - pred_test_df[j,])) / (abs(est_lambdas_df[j,] - est_lambdas_df[(j-1),])) / (nrow(est_lambdas_df)-1)
  MASE2 = c(MASE2,mean(error))
}

prop_model_error = cbind(MAE2,MASE2,MAPE2,RMSPE2)
prop_model_error
##       MAE2       MASE2      MAPE2    RMSPE2
##  [1,] 60.3 0.002484030 0.03650599 83.458852
##  [2,] 15.8 0.014685701 0.01080008 22.063545
##  [3,]  8.7 0.028415913 0.03482330  8.769265
##  [4,] 12.5 0.009661302 0.02760487 15.116216
##  [5,] 12.1 0.060781330 0.02353614 15.261062
##  [6,]  4.6 0.032435032 0.03091740  4.617359
##  [7,]  4.9 0.077876489 0.02404973  5.255473
##  [8,]  7.0 0.039141325 0.01151189  9.350936
##  [9,] 18.7 0.082481231 0.06230002 22.493110
## [10,]  2.4 0.034997667 0.01298351  2.473863
## [11,]  2.5 0.072751219 0.01710814  2.501999
## [12,] 32.8 0.004642714 0.06850193 40.175614
## [13,] 60.3 0.001953506 0.03650599 83.458852
## [14,] 15.8 0.013850983 0.01080008 22.063545
## [15,]  8.7 0.029151950 0.03482330  8.769265
## [16,] 12.5 0.010722947 0.02760487 15.116216
## [17,] 12.1 0.061976990 0.02353614 15.261062
## [18,]  4.6 0.033822717 0.03091740  4.617359
## [19,]  4.9 0.079322473 0.02404973  5.255473
## [20,]  7.0 0.040624878 0.01151189  9.350936
## [21,] 18.7 0.083982921 0.06230002 22.493110
## [22,]  2.4 0.036424074 0.01298351  2.473863
## [23,]  2.5 0.074119759 0.01710814  2.501999
## [24,] 32.8 0.003593823 0.06850193 40.175614
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  title = sprintf("Cluster %s",i)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
  print(post_pred_plot)
}

Grouped barplot for each error metric over 24 time points

# Create data frames for both tables
data1 <- data.frame(
  Time = 37:60,MAE,MASE,MAPE,RMSPE
)

data2 <- data.frame(
  Time = 37:60,MAE2,MASE2,MAPE2,RMSPE2
)
colnames(data2) = colnames(data1)

# Add a column to each data frame to indicate the source table
data1$Source <- 'Ref model'
data2$Source <- 'Prop model'

# Combine the two data frames
combined_data <- rbind(data1, data2)

# Melt the combined data to long format
data_long <- melt(combined_data, id.vars = c("Time", "Source"), variable.name = "Metric", value.name = "Value")

# Plot grouped bar charts using ggplot2
ggplot(data_long, aes(x = factor(Time), y = Value, fill = Source)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(x = "Time Points", y = "Error Values", title = "") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

6 month sliding window forecast

In this section, I implemented a forecasting exercise in which I start with 36 months of training data. I will use that data to estimate a model and then forecast 6 months ahead. Now, we slide the time window of interest and use months 7-36 AND the newly forecasted values to re-estimate the model and forecast again to get the next 6 months. This process continues until the original 36 months of data have been used to produce a complete time series of 60 months (the last 3 years are forecasted). This exercise gives us another way to compare the predictive ability of our various models.

Reference model iteration

starting_data = inla_full_data[1:72,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

while(max(starting_data$time) < 60){
  
  ###Attach df for next 6 months with NAs in response
  end = nrow(starting_data)
  id = rep(1:2,6)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+12)
  response = rep(NA,12)
  time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=2)
  Intercept1 = rep(c(1,NA),6)
  Intercept2 = rep(c(NA,1),6)
  
  if (starting_data$months[end] == 6){
    months = rep(c(7,8,9,10,11,12),each=2)
  } else if (starting_data$months[end] == 12){
    months = rep(c(1,2,3,4,5,6),each=2)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  ###Fit KGR model on most recent 36 months
  starting_data_subset = starting_data[(nrow(starting_data)-83):end,]
  
  ref_formula2 = response ~ months + f(id, model = "bym", graph = huge.est) 
  ref_model2 = inla(ref_formula2,family = "poisson",data = starting_data_subset,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.predictor = list(compute = TRUE, link = 1))
  
  ###Append ref model 2 predictions to starting data
  preds_ref_model2 = ref_model2$summary.fitted.values
  preds_ref_model2$mean = round(preds_ref_model2$mean)

  end2 = nrow(preds_ref_model2)

  pred_data = preds_ref_model2$mean[(end2-11):end2]
  starting_data$response[(end+1):(end+12)] = pred_data

  starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(sprintf("Sliding Timeframe Forecast for Cluster %1.0f",i))
  print(post_pred_plot)
}

Proposed model iteration

inv_covGP = kgr_model2_outfit$prec
inv_covGP3 = kgr_model3_outfit$prec
inv_covGP4 = kgr_model4_outfit$prec
inv_covGP5 = kgr_model5_outfit$prec

starting_data = inla_full_data[1:72,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

while(max(starting_data$time) < 60){
  
  ###Attach df for next 6 months with NAs in response
  end = nrow(starting_data)
  id = rep(1:2,6)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+12)
  response = rep(NA,12)
  time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=2)
  Intercept1 = rep(c(1,NA),6)
  Intercept2 = rep(c(NA,1),6)
  
  if (starting_data$months[end] == 6){
    months = rep(c(7,8,9,10,11,12),each=2)
  } else if (starting_data$months[end] == 12){
    months = rep(c(1,2,3,4,5,6),each=2)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  ###Fit KGR model on most recent 36 months
  starting_data_subset = starting_data[(nrow(starting_data)-83):end,]
  
  # #Proposed model 2
  # kgr_formula2 = response ~ f(id,model = "generic0",Cmatrix = inv_covGP[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model2 = inla(kgr_formula2, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model2$summary.fitted.values
  
  # #Proposed model 3
  # kgr_formula3 = response ~ f(id,model = "generic0",Cmatrix = inv_covGP3[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model3 = inla(kgr_formula3, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model3$summary.fitted.values
  
  # #Proposed model 4
  # kgr_formula4 = response ~ f(id,model = "generic0",Cmatrix = inv_covGP4[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model4 = inla(kgr_formula4, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model4$summary.fitted.values
  
  #Proposed model 5
  kgr_formula5 = response ~ f(id,model = "generic0",Cmatrix = inv_covGP5[c(starting_data_subset$id2),c(starting_data_subset$id2)])

  kgr_model5 = inla(kgr_formula5, data = starting_data_subset, family = "poisson",
                    control.predictor = list(compute = TRUE, link = 1))

  preds_kgr_model = kgr_model5$summary.fitted.values
  
  
  ###Append KGR model predictions to starting data
  preds_kgr_model$mean = round(preds_kgr_model$mean)
  end2 = nrow(preds_kgr_model)

  pred_data = preds_kgr_model$mean[(end2-11):end2]
  starting_data$response[(end+1):(end+12)] = pred_data
  
  # starting_data$response = replace(starting_data$response,which(starting_data$response < 0),0)
  starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5)
  print(post_pred_plot)
}